In [47]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, BooleanType, DoubleType, IntegerType
from pyspark.sql.functions import col, struct, explode, when, lit, array_max, array


In [15]:
# Root directory of the frequency output; the loader below reads the part
# files from one sub-directory per ancestry underneath it.
# The location is machine-specific, so it can be overridden by setting the
# BURDEN_FREQ_PATH environment variable. When that variable is unset or empty
# the original local path is used, so existing runs behave as before.
freq_path = os.environ.get("BURDEN_FREQ_PATH") or "/Users/mduby/Data/Broad/Aggregator/BurdenBinning/Frequency"


In [16]:
# create the Spark session for this notebook (or reuse one that already exists)
spark = (
    SparkSession.builder
    .appName('bioindex')
    .getOrCreate()
)



In [24]:
# Column layout written out by the frequency analysis processor.
# Every column is declared with nullable=False; the order below is the
# column order of the schema.
frequency_schema = StructType([
    StructField(field_name, field_type, nullable=False)
    for field_name, field_type in (
        ('varId', StringType()),
        ('chromosome', StringType()),
        ('position', IntegerType()),
        ('reference', StringType()),
        ('alt', StringType()),
        ('eaf', DoubleType()),
        ('maf', DoubleType()),
        ('ancestry', StringType()),
    )
])

In [35]:
# loader for the frequency files of a single ancestry
def load_freq(ancestry_name, freq_srcdir):
    """Read the frequency part files for one ancestry.

    Reads every ``part-*`` file under ``<freq_srcdir>/<ancestry_name>`` as
    tab-separated text with a header row, applying ``frequency_schema``.

    Args:
        ancestry_name: name of the sub-directory to read; also used as the
            name of the frequency column in the result.
        freq_srcdir: directory that contains the per-ancestry sub-directories.

    Returns:
        A DataFrame with two columns: ``varId`` and the ``maf`` column
        renamed to ``ancestry_name``.
    """
    part_files = '%s/%s/part-*' % (freq_srcdir, ancestry_name)
    frequencies = spark.read.csv(part_files, sep='\t', header=True, schema=frequency_schema)
    return frequencies.select(col('varId'), col('maf').alias(ancestry_name))


In [36]:
# read the frequencies for a single ancestry (AA)
dataframe_frequency = load_freq('AA', freq_path)

# report how many rows were read, then preview them
row_count = dataframe_frequency.count()
print("we have {} frequency data rows".format(row_count))

dataframe_frequency.show()

we have 59441 frequency data rows
+----------------+-------------------+
|           varId|                 AA|
+----------------+-------------------+
| 10:10014153:G:C|             0.1709|
|10:100153688:C:T| 0.3759333333333334|
|10:100169950:T:C|0.16363333333333333|
|10:100170708:G:A|            0.01095|
|10:100174146:C:T|             0.0541|
|10:100175028:T:A|             0.0619|
|10:100195110:C:T|           1.268E-4|
|10:100202971:C:G|0.01120740953773322|
| 10:10020665:C:A|             0.1707|
|10:100217930:G:A|             0.0495|
|10:100221501:G:C|           2.307E-4|
|10:100249850:C:T|           6.341E-5|
| 10:10025888:T:A|0.32603333333333334|
|10:100281229:A:T|             0.0688|
|10:100344905:T:C| 0.4730666666666667|
|10:100425738:T:C|             0.0915|
|10:100447585:T:A|0.34063333333333334|
|10:100481443:C:T|           1.268E-4|
|10:100539620:A:C|             0.4149|
|10:100550711:C:A|0.07943333333333334|
+----------------+-------------------+
only showing top 20 rows



In [37]:
# ancestry codes; each one is loaded from its own frequency sub-directory
ancestries = ['AA', 'AF', 'EA', 'EU', 'HS', 'SA']

# build one wide frame keyed on varId, with one frequency column per ancestry
dataframe_freq = None
for ancestry in ancestries:
    df = load_freq(ancestry, freq_path)

    if dataframe_freq is None:
        # the first ancestry seeds the joined result
        dataframe_freq = df
    else:
        # outer join keeps variants that appear on only one side
        dataframe_freq = dataframe_freq.join(df, 'varId', how='outer')

# report the size of the joined frame, then preview it
joined_count = dataframe_freq.count()
print("Total count for ancestries is: {}\n".format(joined_count))
dataframe_freq.show()


Total count for ancestries is: 325263

+----------------+-------------------+----+----+----+----+----+
|           varId|                 AA|  AF|  EA|  EU|  HS|  SA|
+----------------+-------------------+----+----+----+----+----+
| 10:10014153:G:C|             0.1709|null|null|null|null|null|
|10:100153688:C:T| 0.3759333333333334|null|null|null|null|null|
|10:100169950:T:C|0.16363333333333333|null|null|null|null|null|
|10:100170708:G:A|            0.01095|null|null|null|null|null|
|10:100174146:C:T|             0.0541|null|null|null|null|null|
|10:100175028:T:A|             0.0619|null|null|null|null|null|
|10:100195110:C:T|           1.268E-4|null|null|null|null|null|
|10:100202971:C:G|0.01120740953773322|null|null|null|null|null|
| 10:10020665:C:A|             0.1707|null|null|null|null|null|
|10:100217930:G:A|             0.0495|null|null|null|null|null|
|10:100221501:G:C|           2.307E-4|null|null|null|null|null|
|10:100249850:C:T|           6.341E-5|null|null|null|null|null|
|

In [48]:
# gather the per-ancestry frequency columns into a single array column;
# the array follows the order of `ancestries`, and a slot is null when the
# joined row has no value for that ancestry
ancestry_columns = [col(name) for name in ancestries]
freq = dataframe_freq.select(col('varId'), array(*ancestry_columns).alias('frequency'))

# report the row count, then preview the rows
array_count = freq.count()
print("Total count for ancestries is: {}\n".format(array_count))
freq.show()

Total count for ancestries is: 325263

+----------------+--------------------+
|           varId|           frequency|
+----------------+--------------------+
| 10:10014153:G:C|       [0.1709,,,,,]|
|10:100153688:C:T|[0.37593333333333...|
|10:100169950:T:C|[0.16363333333333...|
|10:100170708:G:A|      [0.01095,,,,,]|
|10:100174146:C:T|       [0.0541,,,,,]|
|10:100175028:T:A|       [0.0619,,,,,]|
|10:100195110:C:T|     [1.268E-4,,,,,]|
|10:100202971:C:G|[0.01120740953773...|
| 10:10020665:C:A|       [0.1707,,,,,]|
|10:100217930:G:A|       [0.0495,,,,,]|
|10:100221501:G:C|     [2.307E-4,,,,,]|
|10:100249850:C:T|     [6.341E-5,,,,,]|
| 10:10025888:T:A|[0.32603333333333...|
|10:100281229:A:T|       [0.0688,,,,,]|
|10:100344905:T:C|[0.47306666666666...|
|10:100425738:T:C|       [0.0915,,,,,]|
|10:100447585:T:A|[0.34063333333333...|
|10:100481443:C:T|     [1.268E-4,,,,,]|
|10:100539620:A:C|       [0.4149,,,,,]|
|10:100550711:C:A|[0.07943333333333...|
+----------------+--------------------+
only showing top 20 rows

In [50]:
# reduce each variant's frequency array to its largest value
largest_frequency = array_max(col('frequency')).alias('max_freq')
max_freq = freq.select(col('varId'), largest_frequency)

# report the row count, then preview the rows
max_count = max_freq.count()
print("Total count for max frequencies is: {}\n".format(max_count))
max_freq.show()

Total count for max frequencies is: 325263

+----------------+-------------------+
|           varId|           max_freq|
+----------------+-------------------+
| 10:10014153:G:C|             0.1709|
|10:100153688:C:T| 0.3759333333333334|
|10:100169950:T:C|0.16363333333333333|
|10:100170708:G:A|            0.01095|
|10:100174146:C:T|             0.0541|
|10:100175028:T:A|             0.0619|
|10:100195110:C:T|           1.268E-4|
|10:100202971:C:G|0.01120740953773322|
| 10:10020665:C:A|             0.1707|
|10:100217930:G:A|             0.0495|
|10:100221501:G:C|           2.307E-4|
|10:100249850:C:T|           6.341E-5|
| 10:10025888:T:A|0.32603333333333334|
|10:100281229:A:T|             0.0688|
|10:100344905:T:C| 0.4730666666666667|
|10:100425738:T:C|             0.0915|
|10:100447585:T:A|0.34063333333333334|
|10:100481443:C:T|           1.268E-4|
|10:100539620:A:C|             0.4149|
|10:100550711:C:A|0.07943333333333334|
+----------------+-------------------+
only showing top 20 rows

In [52]:
# done: stop the Spark session opened at the top of the notebook
spark.stop()
