In [None]:
import os
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession
from delta import *
import findspark

# Let findspark locate the Spark installation for this kernel.
findspark.init()

# --- User-supplied settings -------------------------------------------------
# Replace the bracketed placeholders with real values before running.
# Directory handed to Spark as the SQL warehouse and to Derby as its home.
warehouse_diretory_path = '[YOUR_WAREHOUSE]'
# Full file:// URI of the UniProtKB reviewed (Swiss-Prot) XML dump to ingest.
uniprot_reviewed_protein_annotation_file = 'file:///[FILE_FULL_NAME:uniprot_sprot.xml]'
# Database and table that receive the imported data.
biological_database_name = 'biological_database'
uniprot_reviewed_protein_annotation_table_name = 'bronze_uniprotkb_reviewed_protein_annotation'

# --- Spark configuration ----------------------------------------------------
# NOTE: 'spark.storage.memoryFraction' is intentionally not set. It belongs to
# the legacy memory manager that Spark 3 no longer ships, so the value was
# never read by the session built below.
conf = SparkConf()
conf.setAll(
    [
        # Single-machine run using every available core.
        ('spark.master', 'local[*]'),
        ('spark.driver.host', 'localhost'),
        ('spark.app.name', 'UniprotKB Reviewed Protein Annotation Importer'),
        ('spark.ui.showConsoleProgress', 'true'),
        # Arrow-based pandas conversion is switched off; the fallback flag is
        # only consulted when Arrow is enabled.
        ('spark.sql.execution.arrow.pyspark.enabled', 'false'),
        ('spark.sql.execution.arrow.pyspark.fallback.enabled', 'true'),
        ('spark.dynamicAllocation.enabled', 'false'),
        # The selected XML columns differ in case ('_VALUE' vs '_val', ...),
        # so identifiers are resolved case-sensitively.
        ('spark.sql.caseSensitive', 'true'),
        ('spark.sql.adaptive.enabled', 'true'),
        # Delta Lake SQL extension and catalog implementation.
        ('spark.sql.extensions', 'io.delta.sql.DeltaSparkSessionExtension'),
        ('spark.sql.catalog.spark_catalog', 'org.apache.spark.sql.delta.catalog.DeltaCatalog'),
        ('spark.sql.warehouse.dir', warehouse_diretory_path),
        # Point Derby's system home at the warehouse directory as well
        # (presumably so the embedded metastore files live next to the data
        # rather than in the notebook's working directory).
        ('spark.driver.extraJavaOptions', f'-Dderby.system.home={warehouse_diretory_path}')
    ])

# Build (or reuse) the session with Hive support so saveAsTable persists
# table metadata in the metastore.
spark = SparkSession.builder.config(conf=conf).enableHiveSupport().getOrCreate()

In [None]:
# Columns of each <entry> element that are carried over exactly as parsed.
passthrough_columns = [
    '_created', '_dataset', '_modified', '_version', '_xmlns',
    'accession', 'comment', 'evidence', 'feature', 'gene',
    'geneLocation', 'keyword', 'name', 'organism', 'organismHost',
    'protein', 'proteinExistence', 'reference', 'sequence',
]

# dbReference is cast to an explicit array-of-struct type (and keeps its
# original column name) instead of relying on whatever type the reader produced.
db_reference_cast_expression = """CAST(dbReference AS array<struct<_VALUE:string,
                                                    _evidence:string,
                                                    _id:string,
                                                    _type:string,
                                                    molecule:struct<_VALUE:string,_id:string>,
                                                    property:array<struct<_VALUE:string,_type:string,_val:string>>>>) dbReference"""

# Each <entry> element of the XML file becomes one row.
entry_reader = spark.read.format('xml').options(rowTag='entry')

uniprot_reviewed_protein_annotation_df = (
    entry_reader
    .load(uniprot_reviewed_protein_annotation_file)
    .selectExpr(*passthrough_columns, db_reference_cast_expression)
)

In [None]:
# Create the target database on first run; a no-op when it already exists.
create_database_statement = f'CREATE DATABASE IF NOT EXISTS {biological_database_name};'
spark.sql(create_database_statement)

In [None]:
# Make the target database current so the unqualified table name used by the
# write resolves inside it.
use_database_statement = f'USE {biological_database_name};'
spark.sql(use_database_statement)

In [None]:
# Persist the parsed annotations as a Delta table in the current database.
# Every run replaces the whole table, schema included.
#
# 'partitionOverwriteMode' is deliberately not set: no partitionBy is applied
# to this write, so there are no partitions to overwrite selectively, and
# Delta Lake documents dynamic partition overwrite as incompatible with
# overwriteSchema=true.
uniprot_reviewed_protein_annotation_df.write \
    .format('delta') \
    .mode('overwrite') \
    .option('overwriteSchema', 'true') \
    .saveAsTable(uniprot_reviewed_protein_annotation_table_name)

In [None]:
# Shut down the Spark session now that the import has finished.
spark.stop()