In [19]:
from pyspark.sql.functions import col, xxhash64
from notebookutils import mssparkutils
import re

# Import the Glow Python module and register it with the Spark session.
# `spark` is not created anywhere in this notebook - presumably it is the session
# object predefined by the notebook runtime (TODO confirm). It is rebound here to
# whatever glow.register() returns, and that object is what later cells use.
# NOTE(review): this registration is presumably what makes the 'vcf' data source
# used by the read step available - confirm against the Glow documentation.
import glow
spark = glow.register(spark)

In [20]:
# Output location: storage account, container and relative path.
# The three values below are placeholders and must be edited before running.
# They are combined later into an abfss:// URL of the form
#   abfss://<outputContainer>@<outputStorageAccount>.dfs.core.windows.net/<outputDir>/<dataset>/<file name without extension>
outputStorageAccount = 'Your account name' # replace with your account name
outputContainer = 'Your container name' # replace with your container name
outputDir = 'Your path' # replace with your relative path


# This notebook assumes that it runs in Azure Synapse Analytics and that outputStorageAccount
# is the primary storage account of the workspace, in which case no explicit authentication is set up here.
# For other Synapse scenarios see https://docs.microsoft.com/en-us/azure/synapse-analytics/spark/microsoft-spark-utilities?pivots=programming-language-python
# For Azure Databricks see https://docs.microsoft.com/en-us/azure/databricks/data/data-sources/azure/adls-gen2/ and https://docs.microsoft.com/en-us/azure/databricks/data/data-sources/azure/azure-storage


In [21]:
# Define the public source data location.
# These values are combined later into a wasbs:// URL of the form
#   wasbs://<inputContainer>@<inputStorageAccount>.blob.core.windows.net/<inputDir>/<dataset>
# which is then listed to find the input VCF files.

inputStorageAccount = 'azureopendatastorage'
inputContainer = 'gnomad'
inputDir = 'release/2.1.1/vcf'

In [22]:
# Read, transform and write data

def TransformData(source, sink, colsToDrop, colsToKeepAsArray):
  """Load a VCF source, reshape it and save the result as Parquet.

  The steps, in order:
    1. Read `source` through the Spark reader using the 'vcf' format.
    2. Drop every column named in `colsToDrop`.
    3. Add a 'hashId' column computed with xxhash64 over all remaining columns.
    4. Replace each array-typed column by its element at index 0, except the
       columns named in `colsToKeepAsArray`, which are left untouched.
    5. Write the result to `sink` as Parquet, overwriting existing output.

  Args:
    source: path passed to the reader's load().
    sink: path passed to the writer's save().
    colsToDrop: iterable of column names to remove before hashing.
    colsToKeepAsArray: iterable of array column names that must stay arrays.

  Returns:
    None. The function is called for its write side effect.
  """
  vcfFrame = spark.read.format('vcf').load(source)

  # Remove the unwanted columns one at a time.
  trimmedFrame = vcfFrame
  for unwanted in colsToDrop:
    trimmedFrame = trimmedFrame.drop(unwanted)

  # The hash covers exactly the columns that survived the drop step.
  hashedFrame = trimmedFrame.withColumn('hashId', xxhash64(*trimmedFrame.columns))

  # Array-typed columns are detected from the dtype string reported by Spark.
  arrayCols = {name for name, dtype in hashedFrame.dtypes if dtype.startswith('array')}

  # withColumn on an existing name replaces that column, so the schema order is
  # kept no matter in which order the set is iterated.
  resultFrame = hashedFrame
  for name in arrayCols.difference(colsToKeepAsArray):
    resultFrame = resultFrame.withColumn(name, col(name)[0])

  resultFrame.write.mode('overwrite').format('parquet').save(sink)

In [23]:
# Input files end with '<chromosome>.vcf.bgz', where chromosome is a number from 1 to 22 or X or Y.
# The pattern checks the last character of the chromosome label (a digit, X or Y) followed by
# the extension. A '|' inside a character class is a literal pipe, not alternation, so it is
# deliberately not part of the class.
sourceExtension = '.vcf.bgz'
sourceSuffix = '[XY0-9][.]vcf[.]bgz$'

# Columns to drop and to keep as array
colsToDrop = ['genotypes']
colsToKeepAsArray = ['INFO_vep']

# Datasets to process
datasets = ['genomes']

for dataset in datasets:
  # Folder in the public storage account that holds the VCF files of this dataset
  sourcePath = 'wasbs://%s@%s.blob.core.windows.net/%s/%s' % (inputContainer, inputStorageAccount, inputDir, dataset)
  files = mssparkutils.fs.ls(sourcePath)
  for file in files:
    # Skip everything that is not a per-chromosome VCF file
    if not re.search(sourceSuffix, file.name):
      continue
    source = file.path
    # Cut off the extension by length. str.rstrip('.vcf.bgz') must not be used here:
    # it strips any trailing run of the characters '.', 'v', 'c', 'f', 'b', 'g', 'z'
    # and would therefore also eat the end of a stem that happens to end in one of them.
    # The regex match above guarantees that the name ends with sourceExtension.
    sinkName = file.name[:-len(sourceExtension)]
    # One output folder per input file: <outputDir>/<dataset>/<file name without extension>
    sink = 'abfss://%s@%s.dfs.core.windows.net/%s/%s/%s' % (outputContainer, outputStorageAccount, outputDir, dataset, sinkName)
    TransformData(source, sink, colsToDrop, colsToKeepAsArray)