In [9]:
import os
# Point this process at the OpenJDK 11 installation below. The assignment is
# unconditional, so it replaces any JAVA_HOME inherited from the shell.
# NOTE(review): presumably needed so PySpark starts the intended JVM - confirm.
os.environ["JAVA_HOME"] = "/mnt/extproj/projekte/textmining/markus_tm_projs/jdk/openlogic-openjdk-11.0.22+7-linux-x64"

# Root directory of the mxplore results. It defaults to the original location
# and can be redirected by exporting MXPLORE_PATH before the kernel starts.
# os.path.join(..., "") guarantees the trailing separator that the plain
# string concatenations in the loading cells rely on.
MXPLORE_PATH = os.path.join(
    os.environ.get("MXPLORE_PATH", "/mnt/extproj/projekte/textmining/mxplore/"),
    "",
)

from collections import Counter

In [5]:
from collections import defaultdict
from pyspark.sql import SparkSession

from pyspark.sql.functions import *
from pyspark.sql.types import *

from pyspark.sql import Window


In [6]:
# Re-running this cell in a live kernel keeps the existing session; a new one
# is only built when no usable `spark` global is present.
if "spark" not in globals() or spark is None:
    print("Creating builder")
    spark = (
        SparkSession.builder
        .config("spark.executor.memory", "70g")
        .config("spark.driver.memory", "50g")
        .config("spark.memory.offHeap.enabled", True)
        .config("spark.memory.offHeap.size", "16g")
        .config("spark.sql.shuffle.partitions", 300)
        .appName('mirexplore')
        .getOrCreate()
    )
    
#spark.conf.set("spark.executor.memory", "70g")
#spark.conf.set("spark.driver.memory", "50g")
#spark.conf.set("spark.memory.offHeap.enabled",True)
#spark.conf.set("spark.memory.offHeap.size","16g")  
#spark.conf.set("spark.sql.shuffle.partitions", 300)

Creating builder




In [4]:
# df = spark.createDataFrame(hsaMxRelations+mmuMxRelations, ["miRNA_family", "miRNA_name", "gene_family", "gene_name", "organism", "interaction_direction", "interaction_type", "evidence_source", "evidence_document", "interaction_sentence", "interaction_location_miRNA", "interaction_location_gene"])

In [7]:
def read_annotation_file(inputpath, annotation, excludes=None):
    """Load one tab-separated annotation file into a flat list of hit records.

    A usable line has at least four tab-separated fields: document id,
    concept id, concept name, and a Python-literal sequence of hits. Every
    hit must be indexable as ``(sentence_id, start, end)``.

    Args:
        inputpath: Path of the annotation file to read.
        annotation: Label copied into every returned record (e.g. "disease").
        excludes: Optional container of upper-cased concept names. A line
            whose upper-cased concept is contained in it is dropped and
            counted; the ten most frequent dropped names are printed.

    Returns:
        A list with one tuple per hit:
        ``(doc_id, annotation, concept_id, concept, sentence_id, (start, end))``.

    Raises:
        ValueError, SyntaxError: If the hit field is not a plain Python
            literal (raised by ``ast.literal_eval``).
    """
    # Function-scope import so the cell works without touching the import cell.
    import ast

    db_entries = []
    ignoredConcepts = Counter()
    malformedLines = 0

    with open(inputpath, "r") as fin:

        for line in fin:
            # Strip the line terminator so it never ends up inside a field.
            aline = line.rstrip("\r\n").split("\t")

            # Index 3 is read below, so four fields are the real minimum.
            # Short lines are reported and skipped instead of raising an
            # IndexError halfway through a multi-million line file.
            if len(aline) < 4:
                print(line)
                malformedLines += 1
                continue

            docID, conceptID, concept = aline[:3]

            if excludes is not None and concept.upper() in excludes:
                ignoredConcepts[concept.upper()] += 1
                continue

            # literal_eval only accepts literals (lists, tuples, strings,
            # numbers, ...), so file content can never execute code here.
            sents = ast.literal_eval(aline[3])

            for sent in sents:
                db_entries.append(
                    (docID, annotation, conceptID, concept, sent[0], (sent[1], sent[2]))
                )

    print(annotation, len(db_entries))
    if len(ignoredConcepts) > 0:
        print("Ignored", ignoredConcepts.most_common(10))
    if malformedLines > 0:
        print("Skipped malformed lines", malformedLines)

    return db_entries

def makeExcludes(inEntries):
    """Collect the upper-cased concept names of all given records.

    Every record must be indexable, with the concept name stored at index 3.
    The result is a set, so repeated names collapse into one element.
    """
    return {record[3].upper() for record in inEntries}

In [11]:
# aggregated_pubmed: read the four vocabularies whose concept names take
# precedence over NCIT.
diseaseAnnot = read_annotation_file(
    inputpath=MXPLORE_PATH+"mxresults/aggregated_pubmed/disease.pmid",
    annotation="disease",
)
celllinesAnnot = read_annotation_file(
    inputpath=MXPLORE_PATH+"mxresults/aggregated_pubmed/celllines.pmid",
    annotation="celllines",
)
goAnnot = read_annotation_file(
    inputpath=MXPLORE_PATH+"mxresults/aggregated_pubmed/go.pmid",
    annotation="GeneOntology",
)
modelanatAnnot = read_annotation_file(
    inputpath=MXPLORE_PATH+"mxresults/aggregated_pubmed/model_anatomy.pmid",
    annotation="ModelAnatomy",
)

# Every concept name seen above (upper-cased) is dropped from the NCIT records.
excludeConcepts = makeExcludes(
    inEntries=diseaseAnnot + celllinesAnnot + goAnnot + modelanatAnnot
)

ncitAnnot = read_annotation_file(
    inputpath=MXPLORE_PATH+"mxresults/aggregated_pubmed/ncit.pmid",
    annotation="ncit",
    excludes=excludeConcepts,
)


disease 360668
celllines 50902
GeneOntology 620354
ModelAnatomy 151180
ncit 3024488
Ignored [('CARCINOMA', 8479), ('GENE EXPRESSION', 6136), ('MALIGNANT CELL', 4903), ('STEM CELL', 4319), ('CELL GROWTH', 3728), ('BREAST CARCINOMA', 3582), ('HEPATOCELLULAR CARCINOMA', 3477), ('LUNG CARCINOMA', 3452), ('CELL CYCLE PROCESS', 3241), ('HYPERSENSITIVITY', 3168)]


In [7]:
# aggregated_pmc: same loading steps as for the aggregated_pubmed files, with
# an exclusion set built from the PMC records only.
diseaseAnnotPMC = read_annotation_file(
    inputpath=MXPLORE_PATH+"mxresults/aggregated_pmc/disease.pmid",
    annotation="disease",
)
celllinesAnnotPMC = read_annotation_file(
    inputpath=MXPLORE_PATH+"mxresults/aggregated_pmc/celllines.pmid",
    annotation="celllines",
)
goAnnotPMC = read_annotation_file(
    inputpath=MXPLORE_PATH+"mxresults/aggregated_pmc/go.pmid",
    annotation="GeneOntology",
)
modelanatAnnotPMC = read_annotation_file(
    inputpath=MXPLORE_PATH+"mxresults/aggregated_pmc/model_anatomy.pmid",
    annotation="ModelAnatomy",
)

# Concept names already present in the four lists above are removed from NCIT.
excludeConceptsPMC = makeExcludes(
    inEntries=diseaseAnnotPMC + celllinesAnnotPMC + goAnnotPMC + modelanatAnnotPMC
)

ncitAnnotPMC = read_annotation_file(
    inputpath=MXPLORE_PATH+"mxresults/aggregated_pmc/ncit.pmid",
    annotation="ncit",
    excludes=excludeConceptsPMC,
)


disease 4010966
celllines 847218
GeneOntology 11063712
ModelAnatomy 2283915
ncit 48370696
Ignored [('GENE EXPRESSION', 41185), ('METABOLIC PROCESS', 27059), ('CARCINOMA', 25274), ('STEM CELL', 22310), ('HYPERSENSITIVITY', 21792), ('BREAST CARCINOMA', 20346), ('CYTOPLASM', 20040), ('MALIGNANT CELL', 19674), ('CELL GROWTH', 18791), ('HOMEOSTATIC PROCESS', 18027)]


In [8]:
# Column names for the six-element record tuples, in tuple order.
annotationColumns = ["doc_id", "annotation", "concept_id", "concept", "sentence", "sentence_loc"]

# All PubMed and PMC record lists are concatenated on the driver and handed to
# Spark in one call; column types are inferred from the tuples.
df = spark.createDataFrame(
    (
        diseaseAnnot
        + celllinesAnnot
        + goAnnot
        + modelanatAnnot
        + ncitAnnot
        + diseaseAnnotPMC
        + celllinesAnnotPMC
        + goAnnotPMC
        + modelanatAnnotPMC
        + ncitAnnotPMC
    ),
    annotationColumns,
)

In [9]:
# Preview the combined annotation table (show() prints a limited number of rows).
df.show()

+--------+----------+----------+--------------------+-------------+------------+
|  doc_id|annotation|concept_id|             concept|     sentence|sentence_loc|
+--------+----------+----------+--------------------+-------------+------------+
|37809396|   disease| DOID:1749|squamous cell car...| 37809396.1.1|    {56, 79}|
|37809396|   disease| DOID:1749|squamous cell car...| 37809396.2.1|  {124, 147}|
|37809396|   disease|  DOID:305|           carcinoma| 37809396.1.1|    {70, 79}|
|37809396|   disease|  DOID:305|           carcinoma| 37809396.2.1|  {138, 147}|
|37809692|   disease|  DOID:162|              cancer| 37809692.1.1|    {28, 34}|
|37809692|   disease|  DOID:162|              cancer| 37809692.2.1|  {183, 189}|
|37809692|   disease|  DOID:162|              cancer| 37809692.2.1|  {226, 232}|
|37809692|   disease|  DOID:162|              cancer| 37809692.2.2|    {27, 33}|
|37809692|   disease|  DOID:162|              cancer| 37809692.2.2|    {88, 94}|
|37809692|   disease|  DOID:

In [10]:
# Concept ids that are removed from the table before any aggregation.
# NOTE(review): presumably overly generic terms - confirm the selection criteria.
# NOTE(review): "GO:0040007" is listed twice; harmless for isin(), but check
# whether a different GO id was intended for one of the two entries.
predatoryTerms = [
    # Gene Ontology
    "GO:0040007",
    "GO:0003675",
    "GO:0040007",
    "GO:0065007",
    "GO:0005488",
    "GO:0023052",
    "GO:0010467",
    # NCIT
    "NCIT:C25966",
    "NCIT:C28378",
    "NCIT:C177693",
    "NCIT:C64542",
    "NCIT:C88924",
    "NCIT:C42791",
    "NCIT:C25214",
    "NCIT:C16342",
    "NCIT:C45971",
    "NCIT:C120360",
    "NCIT:C120363",
    "NCIT:C64382",
    "NCIT:C16608",
    "NCIT:C26549",
]

# Keep only rows whose concept_id is not in the list above.
isPredatory = df["concept_id"].isin(predatoryTerms)
df = df.filter(~isPredatory)

In [11]:
# Number of rows per (annotation, concept_id, concept) combination.
conceptCounts = df.groupBy("annotation", "concept_id", "concept").agg(
    count("*").alias("total_count")
)

# Print the ten most frequent combinations without truncating cell contents.
conceptCounts.orderBy("total_count", ascending=False).show(n=10, truncate=False)

+------------+--------------+----------------------------+-----------+
|annotation  |concept_id    |concept                     |total_count|
+------------+--------------+----------------------------+-----------+
|disease     |DOID:162      |cancer                      |1293525    |
|ModelAnatomy|UBERON:2000098|proliferative region        |568554     |
|GeneOntology|GO:0006915    |apoptotic process           |373515     |
|ncit        |NCIT:C17557   |Apoptosis                   |367310     |
|GeneOntology|GO:0097194    |execution phase of apoptosis|363725     |
|ncit        |NCIT:C38784   |Apoptosis Pathway           |359424     |
|ncit        |NCIT:C40557   |Metastatic Lesion           |330532     |
|GeneOntology|GO:0032502    |developmental process       |330335     |
|ncit        |NCIT:C17021   |Protein                     |305890     |
|ncit        |NCIT:C17003   |Polymerase Chain Reaction   |294955     |
+------------+--------------+----------------------------+-----------+
only s

In [12]:
# Print the first five rows of df.
df.show(n=5)

+--------+----------+----------+--------------------+------------+------------+
|  doc_id|annotation|concept_id|             concept|    sentence|sentence_loc|
+--------+----------+----------+--------------------+------------+------------+
|37809396|   disease| DOID:1749|squamous cell car...|37809396.1.1|    {56, 79}|
|37809396|   disease| DOID:1749|squamous cell car...|37809396.2.1|  {124, 147}|
|37809396|   disease|  DOID:305|           carcinoma|37809396.1.1|    {70, 79}|
|37809396|   disease|  DOID:305|           carcinoma|37809396.2.1|  {138, 147}|
|37809692|   disease|  DOID:162|              cancer|37809692.1.1|    {28, 34}|
+--------+----------+----------+--------------------+------------+------------+
only showing top 5 rows



In [13]:
# Intended layout of the packed evidence column: the sentence id plus a nested
# (start_, end_) location struct.
schema_sent = StructType([
    StructField("sent", StringType(), False),
    StructField('loc', StructType([StructField("start_", LongType(), True), StructField("end_", LongType(), False)])),
])

# Build the struct with Spark's built-in struct() rather than a Python UDF:
# the values are identical, but the rows are not passed through a Python
# worker just to be re-wrapped.
# sentence_loc was inferred from 2-tuples, so its fields are named _1 and _2.
# NOTE(review): nullability flags of the new column follow the source columns
# rather than the flags declared in schema_sent - confirm nothing downstream
# depends on them.
df = df.withColumn(
    "sent_evidence",
    struct(
        col("sentence").alias("sent"),
        struct(
            col("sentence_loc").getField("_1").alias("start_"),
            col("sentence_loc").getField("_2").alias("end_"),
        ).alias("loc"),
    ),
).drop("sentence", "sentence_loc")

# Fail early if field names or types drift from the intended layout
# (simpleString() compares names and types and ignores nullability).
assert df.schema["sent_evidence"].dataType.simpleString() == schema_sent.simpleString()

In [14]:
# Print the first five rows to check the new sent_evidence column.
df.show(n=5)

+--------+----------+----------+--------------------+--------------------+
|  doc_id|annotation|concept_id|             concept|       sent_evidence|
+--------+----------+----------+--------------------+--------------------+
|37809396|   disease| DOID:1749|squamous cell car...|{37809396.1.1, {5...|
|37809396|   disease| DOID:1749|squamous cell car...|{37809396.2.1, {1...|
|37809396|   disease|  DOID:305|           carcinoma|{37809396.1.1, {7...|
|37809396|   disease|  DOID:305|           carcinoma|{37809396.2.1, {1...|
|37809692|   disease|  DOID:162|              cancer|{37809692.1.1, {2...|
+--------+----------+----------+--------------------+--------------------+
only showing top 5 rows



In [15]:
# One output row per (doc_id, annotation, concept_id, concept); the distinct
# sent_evidence structs of each group are gathered into an array column.
groupKeys = ["doc_id", "annotation", "concept_id", "concept"]
evidenceSet = collect_set("sent_evidence").alias("sent_evidences")

df_small = df.groupBy(*groupKeys).agg(evidenceSet)
df_small.show(n=5)

+--------+------------+--------------+--------------------+--------------------+
|  doc_id|  annotation|    concept_id|             concept|      sent_evidences|
+--------+------------+--------------+--------------------+--------------------+
|10760272|ModelAnatomy|UBERON:0001969|        blood plasma|[{10760272.2.5, {...|
|12576545|        ncit|     NCIT:C231|          Amino Acid|[{12576545.2.2, {...|
|15361871|GeneOntology|    GO:0040034|regulation of dev...|[{15361871.2.2, {...|
|15634332|        ncit|   NCIT:C17565|   Sequence Analysis|[{15634332.2.2, {...|
|15738415|        ncit|   NCIT:C12474|        B-Lymphocyte|[{15738415.2.6, {...|
+--------+------------+--------------+--------------------+--------------------+
only showing top 5 rows



In [16]:
# Store the aggregated table as parquet; "overwrite" replaces whatever already
# exists at the target location.
df_small.write.mode("overwrite").parquet(
    "/mnt/extproj/projekte/textmining/mx_feb24/mx_document_annotations"
)