In [None]:
from pyspark.sql.functions import explode, col, lit, xxhash64

# Import glow.py and register the Glow package with the Spark session.
# `spark` is not created in this notebook; presumably it is the session object
# supplied by the notebook runtime - confirm for your environment.
# NOTE(review): the return value of glow.register() is discarded here. Check the
# documentation of the installed Glow version to see whether register() returns
# a session that should be captured instead.
import glow
glow.register(spark)

In [None]:
# Session credentials for the source data set.
# The container holding the 1000 Genomes data is accessed with a public SAS,
# so nothing in this cell needs to be edited.

publicSasConfKey = "fs.azure.sas.dataset.dataset1000genomes.blob.core.windows.net"
publicSasToken = "sv=2019-10-10&si=prod&sr=c&sig=9nzcxaQn0NprMPlSh4RhFQHcXedLQIcFgbERiooHEqM%3D"

spark.conf.set(publicSasConfKey, publicSasToken)


In [None]:
# Provide your storage account, container, SAS token and output directory.
# Replace every "<...>" placeholder below before running the rest of the notebook.
#
# - outputDir is appended directly after "...blob.core.windows.net" when the
#   output paths are built, so it must start with "/".
# - Avoid saving a real SAS token in the notebook; consider reading it from a
#   secret store provided by your environment instead of pasting it here.
outputStorageAccount = "<your-storage-account>"
outputContainer = "<your-container>"
outputSAS = "<your-sas-token>"
outputDir = "/<your-output-directory>"

In [None]:
# Register the SAS token for the output container, where the .parquet files
# will be stored. The configuration key is derived from the container and
# storage account names supplied earlier in the notebook.
outputSasConfKey = "fs.azure.sas." + outputContainer + "." + outputStorageAccount + ".blob.core.windows.net"
spark.conf.set(outputSasConfKey, outputSAS)


In [None]:
# Show the files published under the latest release (20130502)

releasePath = "wasbs://dataset@dataset1000genomes.blob.core.windows.net/release/20130502"
dbutils.fs.ls(releasePath)

In [None]:
# Load the chr 22 VCF file(s), asking the reader to include sample ids and to
# flatten the INFO fields into separate columns.

source = "wasbs://dataset@dataset1000genomes.blob.core.windows.net/release/20130502/ALL.chr22*.vcf.gz"

vcfReader = (
  spark.read
  .format("vcf")
  .option("includeSampleIds", True)
  .option("flattenInfoFields", True)
)
data = vcfReader.load(source)

In [None]:
# Preview two rows of regular (non-multiallelic) sites.
# NOTE(review): this filter only matches rows where the flag is literally FALSE;
# if the reader leaves absent flags as NULL it will return nothing - confirm.

regularSites = data.filter("INFO_MULTI_ALLELIC = FALSE")
regularSites.show(2)

In [None]:
# Preview two rows of multiallelic sites

multiAllelicSites = data.filter("INFO_MULTI_ALLELIC = TRUE")
multiAllelicSites.show(2)

In [None]:
# Add a hashId column built from every column except genotypes.
# Later, hashId might be used as a unique id for variants.
#
# xxhash64 depends on the order of its arguments, so the column list must be
# deterministic. A Python set has no guaranteed iteration order, which could
# give the same variant a different hashId from one run to the next; sorting
# the column names keeps the hash input order stable.

hashCols = sorted(c for c in data.columns if c != 'genotypes')
dataHashed = data.withColumn('hashId', xxhash64(*hashCols))

In [None]:
# Persist the variant table as .parquet in your storage account.
# hashVariants selects which DataFrame is written and the sub-folder it goes to:
#   True  -> dataHashed under <outputDir>/hashed/chr22
#   False -> data       under <outputDir>/original/chr22

hashVariants = True

sinkRoot = "wasbs://"+outputContainer + "@" + outputStorageAccount + ".blob.core.windows.net"+ outputDir

if hashVariants:
  frameToWrite, sink = dataHashed, sinkRoot + "/hashed/chr22"
else:
  frameToWrite, sink = data, sinkRoot + "/original/chr22"

# Any existing output at the sink is replaced ("overwrite" mode).
(frameToWrite.write
  .mode("overwrite")
  .format("parquet")
  .save(sink))
  

In [None]:
# Explode the genotypes array: one output row per element of the array

genotypeColumn = col('genotypes')
dataExploded = dataHashed.withColumn('genotypes', explode(genotypeColumn))


In [None]:
# Flatten struct columns - genotypes column in this case

def flattenStructFields(df):
  """Return `df` with each top-level struct column expanded into flat columns.

  Columns whose dtype string does not start with 'struct' are kept first, in
  their original order. Every field of a struct column `s` is then selected as
  `s.<field>` and renamed to `s_<field>`. Only one level is expanded: a field
  that is itself a struct is carried over as a struct.
  """
  plainNames = []
  structNames = []
  for name, dtype in df.dtypes:
    if dtype.startswith('struct'):
      structNames.append(name)
    else:
      plainNames.append(name)

  expandedFields = []
  for parent in structNames:
    # Selecting "<parent>.*" is used only to discover the struct's field names.
    for field in df.select(parent+'.*').columns:
      expandedFields.append(col(parent+'.'+field).alias(parent+'_'+field))

  return df.select(plainNames + expandedFields)

# Flatten the exploded DataFrame; the result feeds the display and write cells below.
dataExplodedFlatten = flattenStructFields(dataExploded)

In [None]:
# Regular (non-multiallelic) sites in the exploded, flattened table
regularSitesFlat = dataExplodedFlatten.filter("INFO_MULTI_ALLELIC = FALSE")
display(regularSitesFlat)

In [None]:
# Multiallelic sites in the exploded, flattened table
multiAllelicSitesFlat = dataExplodedFlatten.filter("INFO_MULTI_ALLELIC = TRUE")
display(multiAllelicSitesFlat)

In [None]:
# Final step: this write is the point where all of the transformations above
# are carried out. It is a long and expensive step - you might need a bigger
# cluster to complete it.
# The exploded data is written to your storage account under <outputDir>/exploded/chr22.

sink = "wasbs://"+outputContainer + "@" + outputStorageAccount + ".blob.core.windows.net"+ outputDir + "/exploded/chr22"

# Any existing output at the sink is replaced ("overwrite" mode).
explodedWriter = dataExplodedFlatten.write.mode("overwrite").format("parquet")
explodedWriter.save(sink)