In [5]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, BooleanType, DoubleType, IntegerType
from pyspark.sql.functions import col, struct, explode, when, lit, array_max, array, split, regexp_replace



In [9]:
# production (S3) locations, kept for reference
# variant_srcdir = 's3://dig-analysis-data/out/varianteffect/common/part-*'
# out_dir = 's3://dig-analysis-data/out/varianteffect/magma/'

# Root of the development data tree. Set the MAGMA_DATA_DIR environment
# variable to run the notebook on another machine; when it is unset (or empty)
# the original localhost layout is used, so the default behaviour is unchanged.
data_root = (os.environ.get('MAGMA_DATA_DIR') or '/Users/mduby/Data/Broad/Magma').rstrip('/')

# development localhost directories (the source entries are glob patterns
# that are handed to spark.read.csv further down)
phenotype_srcdir = data_root + '/Phenotype/*/part-*'
variant_srcdir = data_root + '/Common/part*'
out_dir = data_root + '/Out/Step2'

# print
print("the variant input directory is: {}".format(variant_srcdir))
print("the phenotype input directory is: {}".format(phenotype_srcdir))
print("the output directory is: {}".format(out_dir))


the variant input directory is: /Users/mduby/Data/Broad/Magma/Common/part*
the phenotype input directory is: /Users/mduby/Data/Broad/Magma/Phenotype/*/part-*
the output directory is: /Users/mduby/Data/Broad/Magma/Out/Step2


In [7]:
# schema for the phenotype association files, written as (column name, Spark type)
# pairs in the same order as the original declaration; every column is declared
# non-nullable
_phenotype_columns = [
    ('varId', StringType()),
    ('chromosome', StringType()),
    ('position', IntegerType()),
    ('reference', StringType()),
    ('alt', StringType()),
    ('phenotype', StringType()),
    ('pValue', DoubleType()),
    ('beta', DoubleType()),
    ('zScore', DoubleType()),
    ('stdErr', DoubleType()),
    ('n', DoubleType()),
    ('top', BooleanType()),
]
phenotype_schema = StructType(
    [StructField(name, dtype, nullable=False) for name, dtype in _phenotype_columns]
)

# this is the schema for the common variant file
# NOTE: dbSNP is declared nullable because the loaded file contains rows with no
# dbSNP id (they load as null and are filtered out with isNotNull() later in
# this notebook); declaring it non-nullable would contradict the data.
variant_schema = StructType(
    [
        StructField('varId', StringType(), nullable=False),
        StructField('dbSNP', StringType(), nullable=True),
        StructField('consequence', StringType(), nullable=False),
        StructField('gene', StringType(), nullable=False),
        StructField('transcript', StringType(), nullable=False),
        StructField('impact', StringType(), nullable=False),
    ]
)



In [10]:
# %%
# obtain the Spark session for this notebook, registered under the 'bioindex' app name
session_builder = SparkSession.builder.appName('bioindex')
spark = session_builder.getOrCreate()

# report what kind of object we got back
print("got Spark session of type {}".format(type(spark)))



got Spark session of type <class 'pyspark.sql.session.SparkSession'>


In [13]:
# read the tab-separated variant files (with header) using the variant schema,
# keeping only the variant id and dbSNP id columns
df_variant_load = (
    spark.read
    .csv(variant_srcdir, sep='\t', header=True, schema=variant_schema)
    .select('varId', 'dbSNP')
)

# report the row count and preview the first rows
variant_load_row_count = df_variant_load.count()
print("the loaded variant data frame has {} rows".format(variant_load_row_count))
df_variant_load.show(5)



the loaded variant data frame has 67003328 rows
+---------------+-----------+
|          varId|      dbSNP|
+---------------+-----------+
|1:62185338:AT:A|       null|
| 1:62190015:G:T|rs147606427|
| 1:62190786:C:T|       null|
| 1:62192716:A:G|rs185779444|
| 1:62197508:T:C|rs114282349|
+---------------+-----------+
only showing top 5 rows



In [17]:
# discard the variants that have no dbSNP id; the frame name is reused because
# the join further down reads df_variant_load
has_dbsnp_id = col("dbSNP").isNotNull()
df_variant_load = df_variant_load.filter(has_dbsnp_id)

# report the row count and preview the first rows
variant_rsid_row_count = df_variant_load.count()
print("the non null RS id variant dataframe has {} rows".format(variant_rsid_row_count))
df_variant_load.show(5)


the non null RS id variant dataframe has 59203797 rows
+--------------+-----------+
|         varId|      dbSNP|
+--------------+-----------+
|1:62190015:G:T|rs147606427|
|1:62192716:A:G|rs185779444|
|1:62197508:T:C|rs114282349|
|1:62204697:C:G|rs554118633|
|1:62204701:G:T|rs575749077|
+--------------+-----------+
only showing top 5 rows



In [14]:
# read the tab-separated phenotype association files (with header) using the
# phenotype schema, keeping only the columns used downstream in this notebook
df_phenotype_load = (
    spark.read
    .csv(phenotype_srcdir, sep='\t', header=True, schema=phenotype_schema)
    .select('varId', 'phenotype', 'pValue', 'n')
)

# report the row count and preview the first rows
phenotype_load_row_count = df_phenotype_load.count()
print("the loaded phenotype data frame has {} rows".format(phenotype_load_row_count))
df_phenotype_load.show(5)


the loaded phenotype data frame has 60745157 rows
+------------------+---------+------+--------+
|             varId|phenotype|pValue|       n|
+------------------+---------+------+--------+
|  10:100008663:T:C|      T2D| 0.666|  6104.0|
|  10:100009881:A:G|      T2D|0.1209|  4347.0|
|   10:10001222:C:A|      T2D|  0.45|102677.0|
|  10:100013493:C:T|      T2D|0.9751| 29076.0|
|10:100028288:G:GCA|      T2D|0.1559| 96318.0|
+------------------+---------+------+--------+
only showing top 5 rows



In [18]:
# attach the dbSNP id to each association: inner join on varId, so only
# variants present in both frames are kept
df_joined = df_variant_load.join(
    df_phenotype_load,
    on='varId',
    how='inner',
)

# report the row count and preview the first rows
joined_row_count = df_joined.count()
print("the loaded joined data frame has {} rows".format(joined_row_count))
df_joined.show(5)


the loaded joined data frame has 48149817 rows
+----------------+-----------+---------+------+--------+
|           varId|      dbSNP|phenotype|pValue|       n|
+----------------+-----------+---------+------+--------+
|10:100005684:C:G|rs145568860|      T2D|  0.49|442817.0|
|10:100016209:A:G|rs193028079|      T2D|  0.69|110767.0|
|10:100044045:T:C|rs555032946|      T2D| 0.928|191764.0|
|10:100061414:T:C| rs76589250|      T2D|0.5835|  4347.0|
|10:100108558:G:A|rs139361990|      T2D|  0.23|442817.0|
+----------------+-----------+---------+------+--------+
only showing top 5 rows



In [20]:
# collect the distinct phenotype names into a plain Python list
# (despite its name, df_unique_phenotype is a list, not a data frame)
distinct_phenotype_rows = df_joined.select('phenotype').distinct().collect()
df_unique_phenotype = [row[0] for row in distinct_phenotype_rows]

print("the list of phenotypes is {}".format(df_unique_phenotype))

the list of phenotypes is ['T2D', 'AF']
