# Using Hail with Firecloud data

### Here we'll go through how we can access a Firecloud workspace, manipulate the data model and explore some aspects of the workspace data in Hail.

### First, load some useful tools.

In [1]:
%%capture
! pip install firecloud
from firecloud import fiss
import pandas as pd
pd.set_option('display.max_row', 10000)
import io
import numpy as np
from pprint import pprint


### Next, see what kind of permissions we have on the workspace.

In [2]:
control = fiss.fapi.get_workspace_acl("topmed-shared", "topmed-shared").json()#['acl']
print control['message']


insufficient permissions to perform operation on topmed-shared/topmed-shared


### Since we are not an owner, we'll get denied when trying to see who else is part of the workspace. But, since we are a writer, this does not stop us from working with the data. 

### We can look at how many samples are participants we have within the Firecloud data model.

In [None]:
ent_types = fiss.fapi.list_entity_types("topmed-shared", "topmed-shared").json()
for t in ent_types.keys():
    print t, "count:", ent_types[t]['count']

### Take a look at the data model

In [None]:
samples = fiss.fapi.get_entities("topmed-shared","topmed-shared", "sample").json()
sample_tups = [(samples[i]['attributes']['participant']['entityName'],samples[i]['attributes']['study']) for i in xrange(0,len(samples))]
pid_to_study = dict()
for e in sample_tups:
    pid_to_study.setdefault(e[1], []).append(e[0])

    num_dict = {d:len(pid_to_study[d]) for d in pid_to_study}
print 'Included studies(# participants):\n', '\n'.join([d+'('+str(num_dict[d])+')' for d in num_dict])

### Start Hail.

In [4]:
## start hail context
from hail import *
hc = HailContext()

### The path to the genotype file is also in the Firecloud data model. Next we'll parse this information out and load the file.

In [None]:
samples_sets = fiss.fapi.get_entities('topmed-shared','topmed-shared', 'sample_set').json()
print 'Sample set name:', samples_sets[0]['name']
print 'Sample set fields:', ', '.join(samples_sets[0]['attributes'].keys())
vcf_files = samples_sets[0]['attributes']['vcf']['items']
print '# of vcf files:', len(vcf_files)

### We may want to actually add the data contained in the data model to the genotype file as an annotation. This can be done with Hail but requires some manipulation.

In [61]:
data_model = fiss.fapi.get_entities_tsv("topmed-shared","topmed-shared", "sample")
data_model_text = pd.read_csv(io.StringIO(data_model.text), sep='\t')[['entity:sample_id','participant','CENTER','study','topmed_project']]
data_model_text.rename(columns = {'entity:sample_id':'ent_sample_id', 'participant':'sample_id'}, inplace = True)
data_model_text[['study', 'topmed_project']] = data_model_text[['study', 'topmed_project']].astype(str)
data_model_text.info()

from pyspark.sql import SQLContext
sqlctx = SQLContext(hc.sc)
spark_df = sqlctx.createDataFrame(data_model_text)
kt = KeyTable.from_dataframe(spark_df,key='sample_id') 
vds = vds.annotate_samples_table(kt, root='sa')

data_model_text.head()

Unnamed: 0,participant,FAMID,sex,T2D,T2D_AGE
0,100,100,1,1,70.99
1,1000-10D,1000-10D,1,1,54.0
2,1001,1001,2,0,63.8
3,1001-10D,1001-10D,1,1,55.0
4,1002-10D,1002-10D,1,1,60.0


### Take a look at the sample scheme of our data to see what we have added.

In [62]:
pprint(vds.sample_schema)

Struct{
     FAMID: String,
     sex: Long,
     T2D: Long,
     T2D_AGE: Double
 }


### We can do the same counting samples per cohort as above with Hail.

In [64]:
pprint(vds.query_samples('samples.map(s => sa.topmed_project).counter()'))

{0L: 1408L, 1L: 1466L}


### For another level of annotation, we can run the built in QC from Hail.

In [65]:
vds = vds.variant_qc().cache().sample_qc()

### Check the sample and variant scheme to see what we have added.

In [66]:
pprint(vds.sample_schema)

Struct{
     FAMID: String,
     sex: Long,
     T2D: Long,
     T2D_AGE: Double,
     qc: Struct{
         callRate: Double,
         nCalled: Int,
         nNotCalled: Int,
         nHomRef: Int,
         nHet: Int,
         nHomVar: Int,
         nSNP: Int,
         nInsertion: Int,
         nDeletion: Int,
         nSingleton: Int,
         nTransition: Int,
         nTransversion: Int,
         dpMean: Double,
         dpStDev: Double,
         gqMean: Double,
         gqStDev: Double,
         nNonRef: Int,
         rTiTv: Double,
         rHetHomVar: Double,
         rInsertionDeletion: Double
     }
 }


In [67]:
pprint(vds.variant_schema)

Struct{
     rsid: String,
     qual: Double,
     filters: Set[String],
     info: Struct{
         AC: Array[Int],
         AN: Int,
         SOURCE: Array[String],
         VT: Array[String],
         LDAF: Double,
         AVGPOST: Double,
         RSQ: Double,
         ERATE: Double,
         THETA: Double,
         CIEND: Array[Int],
         CIPOS: Array[Int],
         END: Int
     },
     qc: Struct{
         callRate: Double,
         AC: Int,
         AF: Double,
         nCalled: Int,
         nNotCalled: Int,
         nHomRef: Int,
         nHet: Int,
         nHomVar: Int,
         dpMean: Double,
         dpStDev: Double,
         gqMean: Double,
         gqStDev: Double,
         nNonRef: Int,
         rHeterozygosity: Double,
         rHetHomVar: Double,
         rExpectedHetFrequency: Double,
         pHWE: Double
     }
 }


### We can also use Hail to further annotate and get some useful statistics, per cohort, from our data.

In [68]:
## actually want to look at proportion per cohort of singletons/doubles/etc

# annotate by allele count and frequency for each sample
vds = vds.annotate_samples_expr('sa.nDoubles = gs.filter(g => g.isHet() && va.qc.AC == 2).count()')
vds = vds.annotate_samples_expr('sa.nTri_to_one = gs.filter(g => g.isHet() && va.qc.AC == 3).count()')
vds = vds.annotate_samples_expr('sa.nOne = gs.filter(g => g.isHet() && va.qc.AF < 0.01 && va.qc.AF > 0.001).count()')
vds = vds.annotate_samples_expr('sa.nTen = gs.filter(g => g.isHet() && va.qc.AF < 0.1 && va.qc.AF > 0.01).count()')
vds = vds.annotate_samples_expr('sa.nTen_above = gs.filter(g => g.isHet() && va.qc.AF > 0.1).count()')



Unnamed: 0,Pop,Singletons,Doubletons,Tripletons_to_01,Zero_1_to_1,One_to_10,Ten_above
0,1,220921.0,121000.0,88768.0,2458273.0,20264868.0,113899937.0
1,0,183990.0,107430.0,82624.0,2345580.0,19400797.0,109295707.0


In [None]:
(vds.samples_table()
 .aggregate_by_key(key_expr=['Pop = sa.topmed_project'], agg_expr=['Singletons = sa.map(s => sa.qc.nSingleton).stats().sum',
                                                                                          'Doubletons = sa.map(s => sa.nDoubles).stats().sum',
                                                                                          'Tripletons_to_01 = sa.map(sa => sa.nTri_to_one).stats().sum',
                                                                                          'Zero_1_to_1 = sa.map(sa => sa.nOne).stats().sum',
                                                                                          'One_to_10 = sa.map(sa => sa.nTen).stats().sum',
                                                                                          'Ten_above = sa.map(sa => sa.nTen_above).stats().sum']).to_pandas()
)

### Some more statistics on our >10% frequency variants.

In [69]:
vds.samples_table().aggregate_by_key(key_expr=['Pop = sa.T2D'], agg_expr=['Ten = sa.map(s => sa.nTen_above).stats()']).to_pandas()

Unnamed: 0,Pop,Ten.mean,Ten.stdev,Ten.min,Ten.max,Ten.nNotMissing,Ten.sum
0,1,77694.363574,6482.860547,43951.0,96586.0,1466,113899937.0
1,0,77624.791903,6225.905044,54558.0,95784.0,1408,109295707.0


### Running PCA on the genotype data

In [None]:
# do pca
pca = vds.pca('sa.pca', k=5, eigenvalues='global.eigen')

### Visualize the differences in families within the studies by principle components.

In [None]:
# define some colors
c = ["aec7e8","ff7f0e","ffbb78","2ca02c","98df8a","d62728","ff9896","9467bd","c5b0d5","8c564b","c49c94"]
cz = zip(vds.query_samples('samples.map(s => sa.topmed_project).counter()').keys(),c)
colors = {t[0]: t[1] for t in cz}
# show the pca results
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from collections import Counter
from math import log, isnan
from pprint import pprint
%matplotlib inline
pca_table = pca.samples_table().to_pandas()
plt.scatter(pca_table["sa.pca.PC1"], pca_table["sa.pca.PC2"],
            c = pca_table["sa.metadata.SuperPopulation"].map(colors),
            alpha = .5)
plt.xlim(-0.6, 0.6)
plt.xlabel("PC1")
plt.ylabel("PC2")
legend_entries = [mpatches.Patch(color=c, label=pheno) for pheno, c in colors.items()]
plt.legend(handles=legend_entries, loc=2)
plt.show()