In [None]:
import os
import re

import findspark

# findspark.init() has to run BEFORE anything from pyspark (or delta, which
# itself imports pyspark) is imported: it locates the Spark installation and
# adds its Python libraries to sys.path. Calling it after the pyspark imports
# is too late -- on a machine where pyspark is only reachable through
# SPARK_HOME those imports would already have raised ImportError.
findspark.init()

from pyspark.conf import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit, col
from delta import *

# Placeholder: replace with the directory that should hold the managed tables.
warehouse_diretory_path = '[YOUR_WAREHOUSE]'

conf = SparkConf()
conf.setAll(
    [
        # --- Local, single-machine session ---------------------------------
        ('spark.master', 'local[*]'),
        ('spark.driver.host', 'localhost'),
        ('spark.app.name', 'TCGA Firebrowse DNA Methylation Illumina Human Methylation 450 Level 3 - Breast Cancer Sample Importer'),
        ('spark.ui.showConsoleProgress', 'true'),
        # --- SQL engine behaviour ------------------------------------------
        ('spark.sql.execution.arrow.pyspark.enabled', 'false'),
        ('spark.sql.execution.arrow.pyspark.fallback.enabled', 'true'),
        ('spark.dynamicAllocation.enabled', 'false'),
        # Column names that differ only by case are treated as distinct.
        ('spark.sql.caseSensitive', 'true'),
        ('spark.sql.adaptive.enabled', 'true'),
        # --- Memory sizing -------------------------------------------------
        ('spark.memory.offHeap.enabled', 'true'),
        ('spark.memory.offHeap.size', '5g'),
        ('spark.executor.memory', '30g'),
        ('spark.driver.memory', '20g'),
        # --- Delta Lake integration ----------------------------------------
        ('spark.sql.extensions','io.delta.sql.DeltaSparkSessionExtension'),
        ('spark.sql.catalog.spark_catalog', 'org.apache.spark.sql.delta.catalog.DeltaCatalog'),
        # --- Storage locations ---------------------------------------------
        # Managed tables are written below the warehouse directory; the JVM
        # option points Derby's system home at the same directory
        # (presumably so the embedded metastore files used by
        # enableHiveSupport() live next to the data -- confirm).
        ('spark.sql.warehouse.dir', warehouse_diretory_path),
        ('spark.driver.extraJavaOptions', f'-Dderby.system.home={warehouse_diretory_path}')
    ])

# Reuses an already-running session if one exists, otherwise starts a new one
# with the configuration above.
spark = SparkSession.builder.config(conf=conf).enableHiveSupport().getOrCreate()

In [None]:
# --- Notebook parameters ---------------------------------------------------

# Destination: database and table that receive the imported samples.
biological_database_name = 'biological_database'
tcga_dna_methylation_sample_table_name = 'bronze_tcga_firebrowse_dna_methylation_illumina_human_methylation_450_level3_sample'

# Label stamped on every imported row (the `disease` column) and used as the
# partition key when the table is written.
disease = 'Breast Invasive Carcinoma'

# Source: placeholder URI -- replace the bracketed part with the full path of
# the Firebrowse methylation data file before running the notebook.
tcga_dna_methylation_sample_file = 'file:///[FILE_FULL_NAME:BRCA.methylation__humanmethylation450__jhu_usc_edu__Level_3__within_bioassay_data_set_function__data.data.txt]'

In [None]:
# Load the tab-separated sample file, taking column names from its first
# line. No schema is supplied and schema inference is not enabled, so every
# column is read as a string.
tsv_reader = spark.read.options(header=True, delimiter='\t')
tcga_dna_methylation_sample_df = tsv_reader.csv(tcga_dna_methylation_sample_file)

In [None]:
# Replace every run of characters outside [0-9a-zA-Z$] in a header with a
# single underscore so the names are safe to use as table column names.
sanitized_column_names = [
    re.sub('[^0-9a-zA-Z$]+', '_', column)
    for column in tcga_dna_methylation_sample_df.columns
]

# Rename positionally with toDF() instead of select(col(name).alias(...)):
# col() parses its argument as an attribute path, so a header containing a
# dot or a backtick would raise an AnalysisException before it could be
# sanitised. toDF() never has to resolve the old names.
# Finally tag every row with the disease this file belongs to.
tcga_dna_methylation_sample_df = tcga_dna_methylation_sample_df \
    .toDF(*sanitized_column_names) \
    .withColumn('disease', lit(disease))

In [None]:
# Ensure the target database exists, then make it the current database so the
# unqualified table name used when saving resolves inside it.
create_database_statement = f'CREATE DATABASE IF NOT EXISTS {biological_database_name};'
use_database_statement = f'USE {biological_database_name};'

spark.sql(create_database_statement)
spark.sql(use_database_statement)

In [None]:
# Persist the samples as a Delta table in the current database, partitioned
# by disease.
#
# 'overwrite' + partitionOverwriteMode=dynamic replaces only the partitions
# present in this DataFrame (here: the single `disease` value), leaving any
# other disease partitions already in the table untouched.
#
# The original cell also set overwriteSchema=true. Delta Lake does not allow
# overwriteSchema together with dynamic partition overwrite, so that option
# is replaced by mergeSchema: columns that are new in this file are added to
# the table schema instead of the whole schema being replaced.
(
    tcga_dna_methylation_sample_df.write
    .format('delta')
    .mode('overwrite')
    .option('partitionOverwriteMode', 'dynamic')
    .option('mergeSchema', 'true')
    .partitionBy('disease')
    .saveAsTable(tcga_dna_methylation_sample_table_name)
)

In [None]:
# Stop the Spark session now that the import is finished. Any later cell that
# uses `spark` needs the session to be created again first.
spark.stop()