In [1]:
import os
# Use this specific OpenJDK 11 installation for Java; the variable is set here,
# before the SparkSession is created further down in the notebook.
os.environ["JAVA_HOME"] = "/mnt/extproj/projekte/textmining/markus_tm_projs/jdk/openlogic-openjdk-11.0.22+7-linux-x64"

# Root directory of the mxplore project; the parquet input and output paths
# used later in the notebook are built from it.
MXPLORE_PATH="/mnt/extproj/projekte/textmining/mxplore/"

from collections import Counter

In [2]:
from collections import defaultdict
from pyspark.sql import SparkSession

from pyspark.sql.functions import *
from pyspark.sql.types import *

from pyspark.sql import Window
import glob

In [3]:
# Build the session only once per kernel: re-running this cell keeps an
# already existing `spark` object instead of configuring a new one.
if "spark" not in globals() or spark is None:
    print("Creating builder")
    spark = (
        SparkSession.builder
        .config("spark.executor.memory", "70g")
        .config("spark.driver.memory", "50g")
        .config("spark.memory.offHeap.enabled", True)
        .config("spark.memory.offHeap.size", "16g")
        .config("spark.sql.shuffle.partitions", 300)
        .appName('mirexplore')
        .getOrCreate()
    )
    
#spark.conf.set("spark.executor.memory", "70g")
#spark.conf.set("spark.driver.memory", "50g")
#spark.conf.set("spark.memory.offHeap.enabled",True)
#spark.conf.set("spark.memory.offHeap.size","16g")  
#spark.conf.set("spark.sql.shuffle.partitions", 300)

Creating builder




In [4]:
# Read the consensus parquet dataset and add a `doc_id` column by exploding
# `evidence_documents`, i.e. one output row per entry of that column.
df_int = (
    spark.read
    .parquet(MXPLORE_PATH+"/mxresults/mx_mirna_gene_consensus_parquet/")
    .withColumn("doc_id", explode("evidence_documents"))
)

In [5]:
# Set of all document IDs referenced by at least one interaction.
# distinct() deduplicates inside Spark, so only the unique IDs are shipped to
# the driver instead of one Row per exploded evidence entry.
relDocIDs = {row["doc_id"] for row in df_int.select("doc_id").distinct().collect()}

# Number of referenced documents whose ID starts with "PMC".
print(sum(1 for doc_id in relDocIDs if doc_id.startswith("PMC")))

55331


In [6]:
def read_date_file(inputpath, relevantDocIDs):
    """Collect the first and last listed author of every relevant document.

    Despite its name, this function reads author files: every ``*.author``
    file directly inside ``inputpath`` is scanned line by line. Each line is
    split on tabs and the fields are whitespace-stripped; the first four
    fields are read as document ID, first name, middle name and last name
    (the middle name is not used).

    Lines whose document ID is not in ``relevantDocIDs`` and lines with fewer
    than four fields are skipped.

    NOTE(review): files are opened with the platform default text encoding;
    confirm that this matches the encoding of the author files.

    Args:
        inputpath: Directory that contains the ``*.author`` files.
        relevantDocIDs: Container of document IDs to keep; only membership
            tests (``in``) are performed on it.

    Returns:
        A list of ``(docID, first, last)`` tuples. A document with a single
        accepted line contributes one tuple; a document with two or more
        accepted lines contributes two tuples, the first and the last one
        encountered (these may be equal). Documents appear in the order in
        which they were first seen.
    """
    db_entries = defaultdict(list)

    # glob gives no ordering guarantee; sorting makes the "first"/"last" pick
    # reproducible when a document's lines are spread over several files.
    for infile in sorted(glob.glob("{}/*.author".format(inputpath))):
        with open(infile, "r") as fin:
            for line in fin:
                aline = [x.strip() for x in line.split("\t")]

                docID = aline[0]
                if docID not in relevantDocIDs:
                    continue

                # Need docID, first, middle and last name columns.
                if len(aline) < 4:
                    continue

                first, last = aline[1], aline[3]
                db_entries[docID].append((docID, first, last))

    db_records = []
    for elems in db_entries.values():
        db_records.append(elems[0])
        if len(elems) >= 2:
            db_records.append(elems[-1])

    return db_records


In [7]:
# First/last author tuples for the relevant documents found in the PubMed
# author files; set() drops duplicate (doc_id, first, last) tuples.
pubmed_author_records = read_date_file("/mnt/extproj/projekte/textmining/pubmed_feb24/", relDocIDs)
docAuthors = list(set(pubmed_author_records))
print(len(docAuthors))

93425


In [8]:
# First/last author tuples for the relevant documents found in the PMC
# (oa_comm) author files; set() drops duplicate (doc_id, first, last) tuples.
pmc_author_records = read_date_file("/mnt/extproj/projekte/textmining/pmc_feb24/oa_comm/", relDocIDs)
docAuthorsPMC = list(set(pmc_author_records))
print(len(docAuthorsPMC))

109548


In [9]:
# Combine the PubMed and PMC author tuples into one Spark DataFrame.
author_columns = ["doc_id", "firstname", "lastname"]
df = spark.createDataFrame(docAuthors + docAuthorsPMC, author_columns)

In [10]:
# Preview the combined author table (show() prints the first 20 rows by default).
df.show()

+--------+---------+---------+
|  doc_id|firstname| lastname|
+--------+---------+---------+
|28683304|    Niraj|     Shah|
|34685605| Sung-Lin|       Hu|
|34761332|   Paresh|Prajapati|
|24307102|      Rui|       Wu|
|30151888|      Hui|    Huang|
|37762652|    Zhiye|     Zhao|
|22311119| Yukiharu|  Hiyoshi|
|34743206|   Hubert|   Fleury|
|26826389| Jingjing|      Liu|
|36322021| Xiaolong|       Ni|
|23286334|    Carol|   Moreno|
|31002141|      M-F|   Zhuang|
|26309499|  Ingemar|  Ernberg|
|35272550|      Fan|    Zhang|
|31415795|      Jun|     Shao|
|31494298|    Shuai|      Guo|
|34943951|Christoph| Hoffmann|
|30945557|      Jin|       Xu|
|21321078|  Jolyane|  Meloche|
|34917202| Zhongbao|     Ruan|
+--------+---------+---------+
only showing top 20 rows



In [11]:
# Persist the author table as parquet, replacing any previous output at that path.
df.write.mode("overwrite").parquet(MXPLORE_PATH+"/mxresults/mx_document_pubauthors")

In [12]:
# Preview the author table ordered by doc_id, so that the rows belonging to
# one document are adjacent.
df.orderBy("doc_id").show()

+--------+-------------+------------+
|  doc_id|    firstname|    lastname|
+--------+-------------+------------+
|10760272|            B|       Zheng|
|10760272|            D|        Chen|
|12576545|      Daniela|       Corda|
|12576545|Christopher P|      Berrie|
|12812784|       Victor|      Ambros|
|12812784|   Nicholas S|       Sokol|
|15003116|          Ian|  Pitha-Rowe|
|15003116|         Eric|        Moss|
|15361871|        Aadel|   Chaudhuri|
|15361871|         John|    Obenauer|
|15504739|         Ravi|        Jain|
|15504739|      Bridget|       Lollo|
|15538371|    Matthew N|         Poy|
|15538371|       Satoru|    Kuwajima|
|15634332|     Michel J|       Weber|
|15648093|       Andrea|      Luchin|
|15648093|     Ettore C|degli Uberti|
|15738415|       Liping|         Sun|
|15738415|      Mario F|       Gomez|
|15766526|      Hermann|        Gram|
+--------+-------------+------------+
only showing top 20 rows

