In [11]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, BooleanType, DoubleType, IntegerType
from pyspark.sql.functions import col, struct, explode



In [7]:
# load and output directory (production locations, kept for reference)
# vep_srcdir = 's3://dig-analysis-data/out/varianteffect/effects/part-*'
# outdir = 's3://dig-bio-index/burden/vepbinning'

# test directories: these local paths are only the defaults; set the
# VEP_SRCDIR / VEP_OUTDIR environment variables before starting the kernel
# to point the notebook elsewhere without editing this cell
vep_srcdir = os.environ.get('VEP_SRCDIR', '/Users/mduby/Data/Broad/Aggregator/BurdenBinning/test*')
outdir = os.environ.get('VEP_OUTDIR', '/Users/mduby/Data/Broad/Aggregator/BurdenBinning/Out')




In [23]:
# constants: names of the transcript-consequence fields selected from the
# VEP output; each value is used both as the field name under 'cqs.' and as
# the alias of the resulting column
filter_impact = "impact"
filter_pick = "pick"
filter_polyphen2_hdiv_pred = "polyphen2_hdiv_pred"
filter_polyphen2_hvar_pred = "polyphen2_hvar_pred"
filter_sift_pred = "sift_pred"
# misspelled original name, kept as an alias so existing references still work
filter_sift_red = filter_sift_pred
filter_mutationtaster_pred = "mutationtaster_pred"
filter_lrt_pred = "lrt_pred"
filter_metalr_pred = "metalr_pred"
filter_provean_pred = "provean_pred"
filter_fathmm_pred = "fathmm_pred"
# note: the next two field names contain hyphens, unlike the constant names
filter_fathmm_mkl_coding_pred = "fathmm-mkl_coding_pred"
filter_eigen_pc_raw_rankscore = "eigen-pc-raw_rankscore"
filter_dann_rankscore = "dann_rankscore"
filter_vest3_rankscore = "vest3_rankscore"
filter_cadd_raw_rankscore = "cadd_raw_rankscore"
filter_metasvm_pred = "metasvm_pred"




In [2]:
# variant list schema: six columns, every one declared non-nullable
all_schema = (
    StructType()
    .add('gene_name', StringType(), nullable=False)
    .add('gene_ensemble_id', StringType(), nullable=False)
    .add('chromosome', StringType(), nullable=False)
    .add('position', IntegerType(), nullable=False)
    .add('burdenBinId', IntegerType(), nullable=False)
    .add('varId', StringType(), nullable=False)
)

In [3]:
# get or create the spark session for the 'bioindex' application
spark = (
    SparkSession.builder
    .appName('bioindex')
    .getOrCreate()
)


In [9]:
# load the json data found under vep_srcdir into a data frame
vep = spark.read.json(vep_srcdir)

# print a preview of the loaded rows; show() prints the table itself and
# returns None, so it must not be wrapped in format() (that only made the
# cell evaluate to the string 'None')
print("the loaded test data is:")
vep.show()


+-------------+-------------+---------+-----------------+--------------------+-----------------------+-----------------------+-------------------------------+---------------+---------+------+-----------------------+
|allele_string|assembly_name|      end|               id|               input|intergenic_consequences|most_severe_consequence|regulatory_feature_consequences|seq_region_name|    start|strand|transcript_consequences|
+-------------+-------------+---------+-----------------+--------------------+-----------------------+-----------------------+-------------------------------+---------------+---------+------+-----------------------+
|          A/G|       GRCh37|100011334| 10:100011334:A:G|10	100011334	1000...|                   null|       missense_variant|                           null|             10|100011334|     1|   [[Y/H, protein_co...|
|          G/A|       GRCh37|100015480| 10:100015480:G:A|10	100015480	1000...|                   null|       missense_variant|          

In [25]:
# fields of each transcript consequence to carry over, in output column order
consequence_fields = [
    filter_pick,
    filter_impact,
    filter_polyphen2_hdiv_pred,
    filter_polyphen2_hvar_pred,
    filter_sift_red,
    filter_mutationtaster_pred,
    filter_lrt_pred,
    filter_metalr_pred,
    filter_provean_pred,
    filter_fathmm_pred,
    filter_fathmm_mkl_coding_pred,
    filter_eigen_pc_raw_rankscore,
    filter_dann_rankscore,
    filter_vest3_rankscore,
    filter_cadd_raw_rankscore,
    filter_metasvm_pred,
]

# create a new data frame with one row per picked transcript consequence:
# explode the transcript_consequences array, keep only the entries whose
# pick field equals 1, then select the variant id, the gene id and the
# fields listed above
# NOTE(review): struct() wraps every selected value in a one-field struct
# (rendered as [value] by show()); confirm downstream steps expect that
# rather than the bare column
transcript_consequences = (
    vep.select(vep.id, vep.transcript_consequences)
    .withColumn('cqs', explode(col('transcript_consequences')))
    .filter(col('cqs.' + filter_pick) == 1)
    .select(
        col('id').alias('varId'),
        struct('cqs.gene_id').alias('genEnsembleId'),
        *[struct('cqs.' + field).alias(field) for field in consequence_fields]
    )
)

# print
print("the filtered test data is:")
transcript_consequences.show()



the filtered test data is:
+-------------------+-----------------+----+----------+-------------------+-------------------+---------+-------------------+--------+-----------+------------+-----------+----------------------+----------------------+--------------+---------------+------------------+------------+
|              varId|    genEnsembleId|pick|    impact|polyphen2_hdiv_pred|polyphen2_hvar_pred|sift_pred|mutationtaster_pred|lrt_pred|metalr_pred|provean_pred|fathmm_pred|fathmm-mkl_coding_pred|eigen-pc-raw_rankscore|dann_rankscore|vest3_rankscore|cadd_raw_rankscore|metasvm_pred|
+-------------------+-----------------+----+----------+-------------------+-------------------+---------+-------------------+--------+-----------+------------+-----------+----------------------+----------------------+--------------+---------------+------------------+------------+
|   10:100011334:A:G|[ENSG00000138131]| [1]|[MODERATE]|                [D]|                [D]|      [D]|                [D]|     