In [None]:
import os
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import collect_list, col
from pyspark.sql.types import StructType, StructField, StringType
from delta import *

# --- Storage --------------------------------------------------------------
# Placeholder value: replace with the directory that should hold the warehouse.
warehouse_diretory_path = '[YOUR_WAREHOUSE]'
biological_database_name = 'biological_database'

# --- Output (silver) table ------------------------------------------------
silver_homo_sapiens_gene_annotation_table_name = 'silver_homo_sapiens_gene_annotation'

# --- Input (bronze) tables ------------------------------------------------
# NOTE(review): the two Ensembl table names are not referenced by any query
# visible in this notebook; they are kept so the notebook namespace is unchanged.
bronze_ensembl_gene_entrez_annotation_table_name = 'bronze_ensembl_gene_entrez_annotation'
bronze_bronze_ensembl_gene_uniprot_annotation_table_name = 'bronze_ensembl_gene_uniprot_annotation'
bronze_hgnc_gene_annotation_table_name = 'bronze_hgnc_gene_annotation'
bronze_uniprotkb_reviewed_protein_annotation_table_name = 'bronze_uniprotkb_reviewed_protein_annotation'
bronze_ncbi_gene_go_table_name = 'bronze_ncbi_gene_go'
bronze_ncbi_gene_annotation_table_name = 'bronze_ncbi_gene_annotation'
bronze_mirtarbase_mirna_target_gene_interaction_table_name = 'bronze_mirtarbase_mirna_target_gene_interaction'

# Taxonomy identifier interpolated into the NCBI and UniProtKB filters below.
ncbi_homo_sapiens_taxonomy = 9606

# --- Spark session --------------------------------------------------------
# Settings are kept in insertion order and handed to SparkConf in one call.
spark_settings = {
    # Local, single-machine execution.
    'spark.master': 'local[*]',
    'spark.driver.host': 'localhost',
    'spark.app.name': 'Gene Annotation Importer',
    'spark.ui.showConsoleProgress': 'true',
    'spark.sql.execution.arrow.pyspark.enabled': 'true',
    # Delta Lake SQL extension and catalog implementation.
    'spark.sql.extensions': 'io.delta.sql.DeltaSparkSessionExtension',
    'spark.sql.catalog.spark_catalog': 'org.apache.spark.sql.delta.catalog.DeltaCatalog',
    # Warehouse location; Derby's system home is pointed at the same directory
    # (presumably so the embedded metastore files live next to the warehouse).
    'spark.sql.warehouse.dir': warehouse_diretory_path,
    'spark.driver.extraJavaOptions': f'-Dderby.system.home={warehouse_diretory_path}',
}

conf = SparkConf().setAll(list(spark_settings.items()))

spark = (
    SparkSession.builder
    .config(conf=conf)
    .enableHiveSupport()
    .getOrCreate()
)

In [None]:
# Make the biological database the session's current database so the table
# names used in later queries can stay unqualified.
use_database_statement = f'USE {biological_database_name};'
spark.sql(use_database_statement)


In [None]:
# miRTarBase: one row per target gene (entrez_id) carrying the list of its
# distinct (mirtarbase_id, mirna_id) interactions as structs.
mirtarbase_interaction_query = f"""SELECT 
                                                                       Target_Gene_Entrez_Gene_ID AS entrez_id, 
                                                                       miRTarBase_ID AS mirtarbase_id, 
                                                                       miRNA AS mirna_id
                                                                   FROM {bronze_mirtarbase_mirna_target_gene_interaction_table_name}"""

# Packs the two interaction columns into a single struct column.
mirna_interaction_struct_expr = """named_struct(                                                 
                                                                             'mirtarbase_id', mirtarbase_id,
                                                                             'mirna_id', mirna_id
                                                                         ) AS mirna_interaction"""

bronze_mirtarbase_mirna_target_gene_interaction_df = (
    spark.sql(mirtarbase_interaction_query)
    # Remove repeated (gene, interaction, miRNA) rows before aggregating.
    .dropDuplicates(['entrez_id', 'mirtarbase_id', 'mirna_id'])
    .selectExpr('entrez_id', mirna_interaction_struct_expr)
    .groupBy('entrez_id')
    .agg(collect_list('mirna_interaction').alias('mirna_interactions'))
)
                                                          

In [None]:
# --- NCBI gene -> HGNC cross-reference ------------------------------------
# Keeps rows for the configured taxonomy whose dbXrefs contain 'HGNC:HGNC' and
# extracts the first 'HGNC:<digits>' match as hgnc_id.
ncbi_gene_xref_query = f"""SELECT regexp_extract(dbXrefs, '(HGNC:[0-9]+)', 1) AS hgnc_id, GeneID as entrez_id 
                                               FROM {bronze_ncbi_gene_annotation_table_name} AS g
                                               WHERE `#tax_id` == {ncbi_homo_sapiens_taxonomy} AND 
                                                     dbXrefs like '%HGNC:HGNC%'"""
ncbi_gene_xref_df = spark.sql(ncbi_gene_xref_query)

# --- NCBI gene -> GO annotations ------------------------------------------
# First level: one row per (GeneID, Category, GO_ID) with the list of term
# structs. PubMed is stripped of '-' characters, split on '|' and cleared of
# empty strings.
ncbi_gene_go_query = f"""SELECT GeneID AS entrez_id, GO_ID AS id, Category AS category,                                       
                                              collect_list(
                                                      named_struct(                                                 
                                                            'term', GO_term,
                                                            'qualifier', Qualifier,
                                                            'evidence', Evidence,
                                                            'pubmed_id', array_remove(split(replace(PubMed, '-', ''), '[|]'), '')
                                                      )
                                              ) AS terms
                                       FROM {bronze_ncbi_gene_go_table_name} 
                                       WHERE `#tax_id` == {ncbi_homo_sapiens_taxonomy}
                                       GROUP BY GeneID, Category, GO_ID"""

# Second level: wrap each GO entry in a struct so it can be collected per gene.
go_struct_expr = """named_struct(                                                 
                                                'id', id,
                                                'category', category,
                                                'terms', terms
                                          ) AS go"""

bronze_ncbi_gene_go_df = (
    spark.sql(ncbi_gene_go_query)
    .selectExpr('entrez_id', go_struct_expr)
    .groupBy('entrez_id')
    .agg(collect_list(col('go')).alias('go'))
)

# --- Combine: cross-reference + GO + miRNA interactions -------------------
# Both joins are left joins, so a gene without GO terms or without miRNA
# interactions is still kept.
same_entrez_id = bronze_ncbi_gene_go_df.entrez_id == ncbi_gene_xref_df.entrez_id

ncbi_gene_with_go_df = (
    ncbi_gene_xref_df.alias('ncbi_annotation')
    .join(bronze_ncbi_gene_go_df.alias('ncbi_go'), same_entrez_id, 'left')
    .select('ncbi_annotation.hgnc_id', 'ncbi_annotation.entrez_id', 'ncbi_go.go')
)

bronze_ncbi_gene_annotation_df = ncbi_gene_with_go_df.join(
    bronze_mirtarbase_mirna_target_gene_interaction_df.alias('mirna'),
    ['entrez_id'],
    'left',
)

In [None]:
# HGNC gene records, with source columns renamed to the names used in the
# silver table (for example alias_symbol -> symbol_synonyms, ena -> ena_id).
hgnc_gene_query = f"""SELECT symbol, alias_symbol AS symbol_synonyms, prev_symbol AS previous_symbol, name, 
                                    alias_name AS name_synonyms, prev_name AS previous_name, gene_group_id AS family_id, 
                                    gene_group AS family, location, location_sortable, hgnc_id, vega_id, ucsc_id, ena AS ena_id, 
                                    refseq_accession AS refseq_id, ccds_id, lsdb AS lsm_db_id, cosmic AS cosmic_id, mgd_id, rgd_id,
                                    orphanet AS orphanet_id, pubmed_id, `pseudogene.org` AS pseudogene_id, horde_id, cd AS hcdm_id,
                                    imgt AS imgt_symbol, merops AS merops_id, iuphar AS iuphar_id, mirbase AS mirbase_id, omim_id,
                                    kznf_gene_catalog AS kznf_gene_catalog_id, `mamit-trnadb` AS mamit_trna_db_id, homeodb AS homeo_db_id, 
                                    snornabase AS snornabase_id, bioparadigms_slc AS bioparadigms_slc_id, lncrnadb AS lncrna_db_id, 
                                    enzyme_id AS enzyme_ec_accession_number, intermediate_filament_db AS intermediate_filament_db_id 
                                    FROM {bronze_hgnc_gene_annotation_table_name}"""

bronze_hgnc_gene_df = spark.sql(hgnc_gene_query)

In [None]:
# UniProtKB: map each primary gene symbol to the list of accessions of the
# entries that name it, restricted to the configured taxonomy.
uniprotkb_gene_query = f"""SELECT 
                                                                    accession,
                                                                    explode(gene.name) AS gene
                                                                FROM {bronze_uniprotkb_reviewed_protein_annotation_table_name}
                                                                WHERE organism.dbReference._id = {ncbi_homo_sapiens_taxonomy}"""

bronze_uniprotkb_reviewed_protein_annotation_df = (
    spark.sql(uniprotkb_gene_query)
    # Flatten step by step: one row per accession, then one row per gene name.
    .selectExpr('explode(accession) AS uniprotkb_ids', 'gene')
    .selectExpr('uniprotkb_ids', 'explode(gene) AS gene')
    .selectExpr('uniprotkb_ids', 'gene._VALUE AS gene_symbol', 'gene._type AS type')
    # Only names typed 'primary' are used as the join key against HGNC symbols.
    .filter("type == 'primary'")
    .groupBy('gene_symbol')
    .agg(collect_list(col('uniprotkb_ids')).alias('uniprotkb_ids'))
)

In [None]:
# Assemble the silver table: HGNC is the base; NCBI (GO + miRNA) and UniProtKB
# annotations are attached with left joins so every HGNC row is kept.
hgnc_with_ncbi_df = bronze_hgnc_gene_df.alias('hgnb').join(
    bronze_ncbi_gene_annotation_df.alias('ncbi'),
    ['hgnc_id'],
    'left',
)

# HGNC symbol matched against the UniProtKB primary gene name.
symbol_matches_uniprot_gene = (
    bronze_hgnc_gene_df.symbol
    == bronze_uniprotkb_reviewed_protein_annotation_df.gene_symbol
)

silver_homo_sapiens_gene_annotation_df = (
    hgnc_with_ncbi_df
    .join(
        bronze_uniprotkb_reviewed_protein_annotation_df.alias('uniprot'),
        symbol_matches_uniprot_gene,
        'left',
    )
    # gene_symbol was only needed as the join key.
    .drop(bronze_uniprotkb_reviewed_protein_annotation_df.gene_symbol)
)

In [None]:
# Persist the assembled annotations as a Delta table, replacing both the data
# and the schema of any previous version of the table.
#
# The writer previously also set partitionOverwriteMode=dynamic. Delta Lake
# does not allow overwriteSchema together with dynamic partition overwrite:
# the combination is ignored for an unpartitioned table and rejected for a
# partitioned one. This write is a full-table overwrite with no partitionBy,
# so the dynamic option is dropped and overwriteSchema is kept.
(
    silver_homo_sapiens_gene_annotation_df.write
    .format('delta')
    .mode('overwrite')
    .option('overwriteSchema', 'true')
    .saveAsTable(silver_homo_sapiens_gene_annotation_table_name)
)

In [None]:
# Stop the Spark session created in the first cell. That cell has to be run
# again before any other cell can be re-executed.
spark.stop()