In [1]:
import os
os.environ["JAVA_HOME"] = "/mnt/extproj/projekte/textmining/jdk/openlogic-openjdk-11.0.22+7-linux-x64"


In [2]:
from collections import defaultdict
from pyspark.sql import SparkSession

from pyspark.sql.functions import *
from pyspark.sql.types import *

from pyspark.sql import Window

from collections import defaultdict

In [3]:
import glob 
def read_pmid2pmc(infolder):
    pmid2pmc = defaultdict(set)    
    for file in glob.glob(infolder + "/*.pmid"):
        
        for line in open(file):
            line = line.strip().split("\t")
            
            pmc = line[0]
            pmid = line[1]
            
            pmid2pmc[pmid].add(pmc)
            
    return pmid2pmc

pmid2pmc = read_pmid2pmc("/mnt/extproj/projekte/textmining/pmc_feb24/oa_comm/")

In [4]:
def read_mirexplore_entries(inputpath, organism, ignore_docs=None):
    db_entries = []

    ignoredEntries = set()
    iline=0
    with open(inputpath, "r") as fin:
        
        for iline, line in enumerate(fin):
            aline = line.split("\t")
            #print(aline)
            
            
            mir_family = aline[0]
            mir_name = aline[1]
            
            gene_family = aline[3]
            gene_name = aline[4]
            
            if gene_name.startswith("MIR-"):
                #set(df.filter((df["gene_name"].startswith("MIR-") | df["gene_name"].startswith("mir-"))).select("gene_name").collect())
                #{Row(gene_name='MIR-1'),
                #Row(gene_name='MIR-10'),
                #Row(gene_name='MIR-1201'),
                #Row(gene_name='MIR-1259'),
                #Row(gene_name='MIR-16'),
                #Row(gene_name='MIR-3607'),
                #Row(gene_name='MIR-886')}
                continue
            
            #df.groupBy(['gene_name']).count().orderBy("count", ascending=False).show(n=100)
            if gene_name in ["transcription factor", "osteosarcoma", "insulin", "RNA binding protein", "RNA-binding protein", "lipoprotein"]:
                continue
                        
            evidence_source = "mirexplore"
            evidence_document = aline[6]
            
            if not ignore_docs is None and evidence_document in ignore_docs:
                ignoredEntries.add(evidence_document)
                continue
            
            evidence_same_paragraph = aline[7] == "True"
            evidence_same_sentence = aline[8] == "True"
            
            document_evidences = eval(aline[9])
                            
            for evidence in document_evidences:
                interaction_direction = evidence[17]
                interaction_type = evidence[18]
                
                if interaction_type == "CHNAGE":
                    interaction_type = "CHANGE"
                
                interaction_sentence = evidence[4]
                
                if interaction_direction == "MIR_GENE":
                    interaction_location_mir = evidence[6]
                    interaction_location_gene = evidence[7]
                elif interaction_direction == "GENE_MIR":
                    interaction_location_mir = evidence[7]
                    interaction_location_gene = evidence[6]
                else:
                    assert(False)
        
            
                db_entries.append(
                    (mir_family, mir_name, gene_family, gene_name, organism, interaction_direction, interaction_type, evidence_source, evidence_document, interaction_sentence, interaction_location_mir, interaction_location_gene)
                )
                
    print("Ignored Entries", len(ignoredEntries), "of", iline)
                
    return db_entries


In [5]:
hsaMxRelations = read_mirexplore_entries("/mnt/extproj/projekte/textmining/mx_feb24/aggregated_pubmed/mirna_gene.hsa.pmid", "hsa", pmid2pmc)
mmuMxRelations = read_mirexplore_entries("/mnt/extproj/projekte/textmining/mx_feb24/aggregated_pubmed/mirna_gene.mmu.pmid", "mmu", pmid2pmc)
#old:
#Ignored Entries 13998 of 375563
#Ignored Entries 11818 of 223089

Ignored Entries 13066 of 290200
Ignored Entries 11161 of 191222


In [6]:
hsaMxRelationsPMC = read_mirexplore_entries("/mnt/extproj/projekte/textmining/mx_feb24/aggregated_pmc/mirna_gene.hsa.pmid", "hsa")
mmuMxRelationsPMC = read_mirexplore_entries("/mnt/extproj/projekte/textmining/mx_feb24/aggregated_pmc/mirna_gene.mmu.pmid", "mmu")

#old:
#Ignored Entries 0 of 1631185
#Ignored Entries 0 of 997908

Ignored Entries 0 of 1032109
Ignored Entries 0 of 670334


In [7]:
allTMRelations = defaultdict(list)
from intervaltree import Interval, IntervalTree

In [8]:
def deduplicate_relation_lists( list1, list2 ):
    
    for entry in list1+list2:
        # 0=mir, 2=gene, 5=int_dir, 6=int_type, 9=int_sent
        keyTuple = (entry[0], entry[2], entry[5], entry[6], entry[9])
        allTMRelations[keyTuple].append(entry)
        
    keptTMRelations = defaultdict(list)

    for key in allTMRelations:
        
        if len(allTMRelations[key]) < 2:
            keptTMRelations[key] = allTMRelations[key]
            continue
        
        
        mi_t = IntervalTree()
        g_t = IntervalTree()
        
        for evi, ev in enumerate(allTMRelations[key]):
        
            m_int = Interval(ev[10][0], ev[10][1], evi)
            g_int = Interval(ev[11][0], ev[11][1], evi)
            
            mi_t.add(m_int)
            g_t.add(g_int)
            
        mi_t.merge_overlaps(data_reducer=lambda x,y: list(x + [y]), data_initializer=list())
        g_t.merge_overlaps(data_reducer=lambda x,y: list(x + [y]), data_initializer=list())

        seenIndices = set()

        kept_evs = []
        for evi, ev in enumerate(allTMRelations[key]):
            
            if evi in seenIndices:
                continue
            
            evInterval_m = [x for x in mi_t if evi in x.data][0]
            evInterval_g = [x for x in mi_t if evi in x.data][0]
            
            combinableEvs = set(evInterval_m.data).intersection(evInterval_g.data)
            
            seenIndices.update(combinableEvs)
            
            ev = list(ev)
            combinableOrgs = ";".join(set(sorted([allTMRelations[key][y][4] for y in combinableEvs])))
            ev[4] = combinableOrgs
            ev = tuple(ev)
            
            
            kept_evs.append(ev)
            
            #print(evi, "speaks for", combinableEvs)
        keptTMRelations[key] = kept_evs
        
    allkeptrelations = []
    for key in keptTMRelations:
        allkeptrelations += keptTMRelations[key]
        
    return allkeptrelations

In [9]:
pubmed_relations = deduplicate_relation_lists(hsaMxRelations, mmuMxRelations)
pmc_relations = deduplicate_relation_lists(hsaMxRelationsPMC, mmuMxRelationsPMC)

In [10]:
from pyspark.sql import SparkSession 

In [11]:
if not "spark" in globals() or spark is None:
    print("Creating builder")
    spark = SparkSession.builder.\
            config("spark.executor.memory", "70g").\
            config("spark.driver.memory", "50g").\
            config("spark.memory.offHeap.enabled",True).\
            config("spark.memory.offHeap.size","16g").\
            config("spark.sql.shuffle.partitions", 300).\
            appName('mirexplore').getOrCreate()

Creating builder




In [12]:
df = spark.createDataFrame(pubmed_relations+pmc_relations, ["miRNA_family", "miRNA_name", "gene_family", "gene_name", "organism", "interaction_direction", "interaction_type", "evidence_source", "evidence_document", "interaction_sentence", "interaction_location_miRNA", "interaction_location_gene"])

In [13]:
df.show()

+------------+------------+-----------+-----------+--------+---------------------+----------------+---------------+-----------------+--------------------+--------------------------+-------------------------+
|miRNA_family|  miRNA_name|gene_family|  gene_name|organism|interaction_direction|interaction_type|evidence_source|evidence_document|interaction_sentence|interaction_location_miRNA|interaction_location_gene|
+------------+------------+-----------+-----------+--------+---------------------+----------------+---------------+-----------------+--------------------+--------------------------+-------------------------+
|     miR-140|  miR-140-5p|       TLR4|       TLR4| mmu;hsa|             MIR_GENE|             NEU|     mirexplore|         37593020|        37593020.1.1|                {108, 118}|               {131, 135}|
|     miR-155|     miR-155|      TACR1|        SPR|     hsa|             MIR_GENE|             NEU|     mirexplore|         37901511|        37901511.2.5|              

In [14]:
schema_sent = StructType([
    StructField("sent", StringType(), False),
    StructField('loc_miRNRA', StructType([StructField("start_", LongType(), True), StructField("end_", LongType(), False)])),
    StructField('loc_gene', StructType([StructField("start_", LongType(), True), StructField("end_", LongType(), False)]))
])

ev_loc_udf = udf(
    lambda x,y,z: (x, y, z),
    schema_sent
)


schema_int = StructType([
    StructField("int_dir", StringType(), False),
    StructField('int_type', StringType(), False)
    ])

reg_ev_udf = udf(
    lambda x,y: (x, y),
    schema_int
)

df = df.withColumn("sent_evidence", ev_loc_udf("interaction_sentence", "interaction_location_miRNA", "interaction_location_gene"))
df = df.withColumn("interaction", reg_ev_udf("interaction_direction", "interaction_type"))
df.show()

+------------+------------+-----------+-----------+--------+---------------------+----------------+---------------+-----------------+--------------------+--------------------------+-------------------------+--------------------+----------------+
|miRNA_family|  miRNA_name|gene_family|  gene_name|organism|interaction_direction|interaction_type|evidence_source|evidence_document|interaction_sentence|interaction_location_miRNA|interaction_location_gene|       sent_evidence|     interaction|
+------------+------------+-----------+-----------+--------+---------------------+----------------+---------------+-----------------+--------------------+--------------------------+-------------------------+--------------------+----------------+
|     miR-140|  miR-140-5p|       TLR4|       TLR4| mmu;hsa|             MIR_GENE|             NEU|     mirexplore|         37593020|        37593020.1.1|                {108, 118}|               {131, 135}|{37593020.1.1, {1...| {MIR_GENE, NEU}|
|     miR-155|  

In [15]:
df = df.drop("interaction_sentence")
df = df.drop("interaction_location_miRNA")
df = df.drop("interaction_location_gene")

In [16]:
df.printSchema()

root
 |-- miRNA_family: string (nullable = true)
 |-- miRNA_name: string (nullable = true)
 |-- gene_family: string (nullable = true)
 |-- gene_name: string (nullable = true)
 |-- organism: string (nullable = true)
 |-- interaction_direction: string (nullable = true)
 |-- interaction_type: string (nullable = true)
 |-- evidence_source: string (nullable = true)
 |-- evidence_document: string (nullable = true)
 |-- sent_evidence: struct (nullable = true)
 |    |-- sent: string (nullable = false)
 |    |-- loc_miRNRA: struct (nullable = true)
 |    |    |-- start_: long (nullable = true)
 |    |    |-- end_: long (nullable = false)
 |    |-- loc_gene: struct (nullable = true)
 |    |    |-- start_: long (nullable = true)
 |    |    |-- end_: long (nullable = false)
 |-- interaction: struct (nullable = true)
 |    |-- int_dir: string (nullable = false)
 |    |-- int_type: string (nullable = false)



In [17]:
df = df.withColumn('miRNA_family', regexp_replace('miRNA_family', '__', ''))
df.show()

+------------+------------+-----------+-----------+--------+---------------------+----------------+---------------+-----------------+--------------------+----------------+
|miRNA_family|  miRNA_name|gene_family|  gene_name|organism|interaction_direction|interaction_type|evidence_source|evidence_document|       sent_evidence|     interaction|
+------------+------------+-----------+-----------+--------+---------------------+----------------+---------------+-----------------+--------------------+----------------+
|     miR-140|  miR-140-5p|       TLR4|       TLR4| mmu;hsa|             MIR_GENE|             NEU|     mirexplore|         37593020|{37593020.1.1, {1...| {MIR_GENE, NEU}|
|     miR-155|     miR-155|      TACR1|        SPR|     hsa|             MIR_GENE|             NEU|     mirexplore|         37901511|{37901511.2.5, {1...| {MIR_GENE, NEU}|
|     miR-155|     miR-155|        SPR|        SPR| mmu;hsa|             MIR_GENE|             NEU|     mirexplore|         37901511|{379015

In [18]:
df.write.parquet("/mnt/extproj/projekte/textmining/mx_feb24/mx_mirna_gene_parquet", mode="overwrite")

In [19]:
hsaMxRelations=None
mmuMxRelations=None

In [20]:
df.show()

+------------+------------+-----------+-----------+--------+---------------------+----------------+---------------+-----------------+--------------------+----------------+
|miRNA_family|  miRNA_name|gene_family|  gene_name|organism|interaction_direction|interaction_type|evidence_source|evidence_document|       sent_evidence|     interaction|
+------------+------------+-----------+-----------+--------+---------------------+----------------+---------------+-----------------+--------------------+----------------+
|     miR-140|  miR-140-5p|       TLR4|       TLR4| mmu;hsa|             MIR_GENE|             NEU|     mirexplore|         37593020|{37593020.1.1, {1...| {MIR_GENE, NEU}|
|     miR-155|     miR-155|      TACR1|        SPR|     hsa|             MIR_GENE|             NEU|     mirexplore|         37901511|{37901511.2.5, {1...| {MIR_GENE, NEU}|
|     miR-155|     miR-155|        SPR|        SPR| mmu;hsa|             MIR_GENE|             NEU|     mirexplore|         37901511|{379015

In [21]:
df.groupBy("gene_family", 'gene_name').count().sort("count", ascending=False).head(50)


[Row(gene_family='ITK', gene_name='EMT', count=20987),
 Row(gene_family='SLC22A3', gene_name='EMT', count=20987),
 Row(gene_family='PTEN', gene_name='PTEN', count=17115),
 Row(gene_family='TP53', gene_name='p53', count=11148),
 Row(gene_family='IL6', gene_name='IL-6', count=7949),
 Row(gene_family='CDH17', gene_name='cadherin', count=7932),
 Row(gene_family='TNF', gene_name='TNF-α', count=7815),
 Row(gene_family='SIRT1', gene_name='SIRT1', count=7140),
 Row(gene_family='STAT3', gene_name='STAT3', count=6685),
 Row(gene_family='MALAT1', gene_name='MALAT1', count=6636),
 Row(gene_family='CDH1', gene_name='E-Cadherin', count=6176),
 Row(gene_family='ZEB1', gene_name='ZEB1', count=6115),
 Row(gene_family='BCL2', gene_name='Bcl-2', count=5865),
 Row(gene_family='IL1B', gene_name='IL-1β', count=5627),
 Row(gene_family='HIF1A', gene_name='HIF-1α', count=5600),
 Row(gene_family='H19', gene_name='H19', count=5170),
 Row(gene_family='VEGFA', gene_name='VEGF', count=5168),
 Row(gene_family='IRF6'