In [1]:
# imports
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, BooleanType, DoubleType, IntegerType
from pyspark.sql.functions import col, struct, explode, when, lit, array, udf
import pyspark as pysp 

print("PySpark version is {}".format(pysp.__version__))


PySpark version is 2.4.5


In [17]:
# directory location
freq_srcdir = '/Users/mduby/Data/Broad/Aggregator/BurdenBinning/JsonFrequency'
freq_srcdir_cvs = '/Users/mduby/Data/Broad/Aggregator/BurdenBinning/Frequency'


In [3]:
# open spark session
spark = SparkSession.builder.appName('bioindex').getOrCreate()



In [14]:
# functions
# method to load the frequencies
def load_freq(ancestry_name, input_srcdir):
    return spark.read \
        .json('%s/%s/part-*' % (input_srcdir, ancestry_name), sep='\t', header=True, schema=frequency_schema) \
        .select(var_id_col, maf_col.alias(ancestry_name))

# functions
# method to load the frequencies in csv format
def load_freq_csv(ancestry_name, input_srcdir):
    return spark.read \
        .csv('%s/%s/part-*' % (input_srcdir, ancestry_name), sep='\t', header=True, schema=frequency_schema) \
        .select(var_id_col, maf_col.alias(ancestry_name))

# method to get the max of an array
def max_array(array_var):
    max = 0.0                        # maf will never be less than 0
    for element in array_var:
        if (element is not None):
            if (element > max):
                max = element
    return max

# custom function used for sorting chromosomes properly
max_array_udf = udf(max_array, DoubleType())


In [19]:
# setup variables
# ancestries = ['AA', 'AF', 'EA', 'EU', 'HS', 'SA']
ancestries = ['AA', 'EA', 'EU', 'HS', 'SA']
dataframe_freq = None
dataframe_freq_csv = None

var_id = "varId"
maf = 'maf'


# schemas for csv files
# this is the schema written out by the frequency analysis processor
frequency_schema = StructType(
    [
        StructField('varId', StringType(), nullable=False),
        StructField('chromosome', StringType(), nullable=False),
        StructField('position', IntegerType(), nullable=False),
        StructField('reference', StringType(), nullable=False),
        StructField('alt', StringType(), nullable=False),
        StructField('eaf', DoubleType(), nullable=False),
        StructField('maf', DoubleType(), nullable=False),
        StructField('ancestry', StringType(), nullable=False),
    ]
)

# column variables for output
var_id_col = col(var_id)
maf_col = col(maf)


In [15]:
# load frequencies by variant ID
for ancestry in ancestries:
    df = load_freq_csv(ancestry, freq_srcdir_cvs)

    # final, joined frequencies
    dataframe_freq_csv = df if dataframe_freq_csv is None else dataframe_freq_csv.join(df, var_id, how='outer')

# pull all the frequencies together into a single array
dataframe_freq_csv = dataframe_freq_csv.select(dataframe_freq_csv.varId, array(*ancestries).alias('frequency'))
#
# # get the max for all frequencies
dataframe_freq_csv = dataframe_freq_csv.withColumn('maf', max_array_udf('frequency')).select(dataframe_freq_csv.varId, 'maf')


print("the loaded frequency data frame has {} rows".format(dataframe_freq_csv.count()))
dataframe_freq_csv.show()


the loaded frequency data frame has 325263 rows
+----------------+-------------------+
|           varId|                maf|
+----------------+-------------------+
| 10:10014153:G:C|             0.1709|
|10:100153688:C:T| 0.3759333333333334|
|10:100169950:T:C|0.16363333333333333|
|10:100170708:G:A|            0.01095|
|10:100174146:C:T|             0.0541|
|10:100175028:T:A|             0.0619|
|10:100195110:C:T|           1.268E-4|
|10:100202971:C:G|0.01120740953773322|
| 10:10020665:C:A|             0.1707|
|10:100217930:G:A|             0.0495|
|10:100221501:G:C|           2.307E-4|
|10:100249850:C:T|           6.341E-5|
| 10:10025888:T:A|0.32603333333333334|
|10:100281229:A:T|             0.0688|
|10:100344905:T:C| 0.4730666666666667|
|10:100425738:T:C|             0.0915|
|10:100447585:T:A|0.34063333333333334|
|10:100481443:C:T|           1.268E-4|
|10:100539620:A:C|             0.4149|
|10:100550711:C:A|0.07943333333333334|
+----------------+-------------------+
only showing top

In [21]:
# load frequencies by variant ID
for ancestry in ancestries:
    df = load_freq(ancestry, freq_srcdir)

    # final, joined frequencies
    dataframe_freq = df if dataframe_freq is None else dataframe_freq.join(df, var_id, how='outer')

# # pull all the frequencies together into a single array
# dataframe_freq = dataframe_freq.select(dataframe_freq.varId, array(*ancestries).alias('frequency'))
# #
# # # get the max for all frequencies
# dataframe_freq = dataframe_freq.withColumn('maf', max_array_udf('frequency')).select(dataframe_freq.varId, 'maf')


print("the loaded frequency data frame has {} rows".format(dataframe_freq.count()))
dataframe_freq.show()


the loaded frequency data frame has 507581 rows
+--------------------+---+----+----+----+----+----+
|               varId|maf|  AA|  EA|  EU|  HS|  SA|
+--------------------+---+----+----+----+----+----+
|{"varId":"10:1015...|0.0|null|null|null|null|null|
|{"varId":"10:1017...|0.0|null|null|null|null|null|
|{"varId":"10:1020...|0.0|null|null|null|null|null|
|{"varId":"10:1038...|0.0|null|null|null|null|null|
|{"varId":"10:1039...|0.0|null|null|null|null|null|
|{"varId":"10:1057...|0.0|null|null|null|null|null|
|{"varId":"10:1059...|0.0|null|null|null|null|null|
|{"varId":"10:1089...|0.0|null|null|null|null|null|
|{"varId":"10:1090...|0.0|null|null|null|null|null|
|{"varId":"10:1093...|0.0|null|null|null|null|null|
|{"varId":"10:1101...|0.0|null|null|null|null|null|
|{"varId":"10:1106...|0.0|null|null|null|null|null|
|{"varId":"10:1157...|0.0|null|null|null|null|null|
|{"varId":"10:1159...|0.0|null|null|null|null|null|
|{"varId":"10:1163...|0.0|null|null|null|null|null|
|{"varId":"10:11