In [1]:
%load_ext autoreload
%autoreload 2

import os, sys
import numpy
from collections import defaultdict, Counter
import datetime


In [2]:
# Make the parent of the notebook directory importable so the project packages
# (textdb, synonymes, utils) resolve. Guarded so that re-running this cell does
# not keep prepending duplicate entries to sys.path.
print(os.getcwd())
projectRoot = os.path.dirname(os.path.realpath(os.getcwd()))
if projectRoot not in sys.path:
    sys.path.insert(0, projectRoot)
print(sys.path)

/mnt/d/dev/git/miRExplore/python/nbs
['/mnt/d/dev/git/miRExplore/python', '/mnt/d/dev/git/miRExplore/python/nbs', '/usr/lib/python38.zip', '/usr/lib/python3.8', '/usr/lib/python3.8/lib-dynload', '', '/usr/local/lib/python3.8/dist-packages', '/usr/lib/python3/dist-packages', '/usr/local/lib/python3.8/dist-packages/IPython/extensions', '/home/mjoppich/.ipython']


In [3]:
%autoreload 2
from textdb.PubmedDateDB import PubmedDateDB
%autoreload 2
from synonymes.mirnaID import miRNA, miRNAPART
%autoreload 2
from textdb.MiGenRelDB import MiGenRelDB,MiRGeneRel
%autoreload 2
from synonymes.GeneOntology import GeneOntology
%autoreload 2
from textdb.PMID2XDB import PMID2XDB


In [4]:
# Input locations for this analysis. Both values end in "/" and later cells
# append names that start with "/", so the resulting paths contain "//".
obodir = "/mnt/d/dev/data/pmid_jun2020/obodir/"  # ontology (.obo) files
pmidBase = "/mnt/d/dev/data/pmid_jun2020/aggregated_pmid/"  # aggregated *.pmid files

In [5]:
from utils.tmutils import normalize_gene_names

# Synonym file passed to normalize_gene_names.
# NOTE(review): this directory differs from the `obodir` variable used by the
# other cells -- confirm that the owncloud copy is the intended source.
hgncSynonymFile = "/mnt/d/owncloud/data/miRExplore/obodir/" + "/hgnc_no_withdrawn.syn"
normGeneSymbols = normalize_gene_names(path=hgncSynonymFile)

In [6]:
# Both organism files are read with the same loader settings, so the keyword
# arguments are spelled out once and shared between the two calls.
mirGeneLoadArgs = {
    "ltype": "mirna",
    "rtype": "gene",
    "normGeneSymbols": normGeneSymbols,
    "switchLR": True,
    "stopAfter": -1,
}

print("Loading hsa")
mirelPMIDhsa = MiGenRelDB.loadFromFile(pmidBase + "/mirna_gene.hsa.pmid", **mirGeneLoadArgs)
print("Loading mmu")
mirelPMIDmmu = MiGenRelDB.loadFromFile(pmidBase + "/mirna_gene.mmu.pmid", **mirGeneLoadArgs)

# Every later cell iterates over this list instead of the individual DBs.
relDBs = [mirelPMIDhsa, mirelPMIDmmu]

Loading hsa
Gene Symbols Normalized 77476
Loaded file /mnt/d/dev/data/pmid_jun2020/aggregated_pmid//mirna_gene.hsa.pmid
Accepted Doc IDs 40682
Rejected Doc IDs 0
Seen genes 7906
Seen miRNAs 2116
Seen Harm. miRNAs 1615
Loading mmu
Gene Symbols Normalized 12366
Loaded file /mnt/d/dev/data/pmid_jun2020/aggregated_pmid//mirna_gene.mmu.pmid
Accepted Doc IDs 36942
Rejected Doc IDs 0
Seen genes 6428
Seen miRNAs 2049
Seen Harm. miRNAs 1578


In [7]:
# Union of the evidence document IDs reported by every relation DB.
miRNAGenePMIDs = set()
for rdb in relDBs:
    miRNAGenePMIDs.update(rdb.get_evidence_docids())

In [8]:
# Number of distinct evidence document IDs collected across all relation DBs.
len(miRNAGenePMIDs)

36251

In [9]:
diseaseObo = GeneOntology(obodir + "/doid.obo")


def _idsWithChildren(term):
    """Return the IDs of everything term.getAllChildren() yields, followed by the term's own ID."""
    return [child.term.id for child in term.getAllChildren()] + [term.id]


# Reference entries for the disease terms of interest:
#{'group': 'disease', 'termid': 'DOID:1936', 'name': 'atherosclerosis'}
#{'group': 'disease', 'termid': 'DOID:2349', 'name': 'arteriosclerosis'}
#{'group': 'disease', 'termid': 'DOID:1287', 'name': 'cardiovascular system disease'},
elemTerm = diseaseObo['DOID:1936']
elemTerms = _idsWithChildren(elemTerm)

# The cardiovascular ID list is extended by the DOID:1936 IDs collected above.
cvTerm = diseaseObo['DOID:1287']
cvTerms = _idsWithChildren(cvTerm) + elemTerms

# miRNAGenePMIDs is handed to the loader as its third argument.
# NOTE(review): presumably restricts loading to these documents -- confirm in PMID2XDB.
pmid2disease = PMID2XDB.loadFromFile(pmidBase + "/disease.pmid", diseaseObo, miRNAGenePMIDs)

0
10000
no parent 2382


In [10]:
# number of genes with interaction: merge all_ltypes of every relation DB
allGenes = set()
for rdb in relDBs:
    allGenes.update(rdb.all_ltypes)

print("Number of genes with interaction", len(allGenes))

Number of genes with interaction 7176


In [11]:
# number of miRNAs with interaction
## restrict to miR-x
# Each name in all_rtypes is parsed and reduced to its MATURE/ID/PRECURSOR
# parts, so names that differ only in the remaining parts collapse into one.
allMirnas = set()
for rdb in relDBs:
    for mirna in rdb.all_rtypes:
        try:
            mirObj = miRNA(mirna)
            allMirnas.add(mirObj.getStringFromParts([miRNAPART.MATURE, miRNAPART.ID, miRNAPART.PRECURSOR]))
        except Exception as err:
            # Fail loudly with the offending name and the original traceback.
            # A bare `except:` would also trap KeyboardInterrupt, and calling
            # exit() inside a notebook discards the traceback instead of
            # reporting the error.
            raise ValueError("Could not process miRNA name: {!r}".format(mirna)) from err

print("Number of mirnas with interaction", len(allMirnas))
print("DO NOT USE THIS NUMBER!!!!")

Number of mirnas with interaction 1121
DO NOT USE THIS NUMBER!!!!


In [12]:
# Peek at the first nine right-hand names of each DB together with their type.
previewLimit = 9
for rdb in relDBs:
    shown = 0
    for mirna in rdb.all_rtypes:
        print(mirna, type(mirna))

        shown += 1
        if shown >= previewLimit:
            break
    

miR-521 <class 'str'>
miR-144 <class 'str'>
miR-494-mediated <class 'str'>
hsa-miR-627 <class 'str'>
miR-466d <class 'str'>
miR-466l <class 'str'>
miR-497 <class 'str'>
miR-3651 <class 'str'>
miR-7706 <class 'str'>
miR-144 <class 'str'>
miR-494-mediated <class 'str'>
miR-466d <class 'str'>
miR-466l <class 'str'>
miR-497 <class 'str'>
miR-3651 <class 'str'>
miR-7706 <class 'str'>
miR-613 <class 'str'>
miR-669c <class 'str'>


In [17]:
# Name parts used for the two levels of miRNA resolution compared below.
fullParts = [miRNAPART.MATURE, miRNAPART.ID, miRNAPART.PRECURSOR, miRNAPART.MATURE_SEQS, miRNAPART.ARM]
shortParts = [miRNAPART.MATURE, miRNAPART.ID, miRNAPART.PRECURSOR]

seenFullMirs = set()
seenShortMirs = set()

for rdb in relDBs:
    for gene in rdb.ltype2rel:
        for rel in rdb.ltype2rel[gene]:
            mirstr = rel.orig_names[1]

            # Sanity check: the second original name must be the relation's right-hand ID.
            assert mirstr == rel.rid

            omir = miRNA(mirstr)
            seenFullMirs.add(omir.getStringFromParts(fullParts))
            seenShortMirs.add(omir.getStringFromParts(shortParts))

print("Short miRNAs", len(seenShortMirs))
print("Full miRNAs", len(seenFullMirs))

Short miRNAs 1121
Full miRNAs 1498


In [25]:


def calculateStatistics(filter=None):
    """Print summary counts for the miRNA-gene relations stored in ``relDBs``.

    Walks every relation of every database in the module-level ``relDBs`` and
    collects distinct (gene, miRNA) interactions, where the gene is ``rel.lid``
    and the miRNA is ``rel.rid`` reduced to its MATURE/ID/PRECURSOR parts.
    Interactions are additionally bucketed by whether their evidence document
    (``rel.docid``) has an entry in ``pmid2disease``, and whether any of that
    entry's ``termid`` values is listed in ``elemTerms`` (reported as "athero")
    or ``cvTerms`` (reported as "cv").

    Args:
        filter: Optional callable taking a relation. Relations for which it
            returns a truthy value are SKIPPED (it is an exclusion predicate).
            ``None`` keeps every relation. The name shadows the builtin
            ``filter`` inside this function; it is kept so existing callers
            keep working.

    Returns:
        None. All results are written to stdout.
    """
    # Hoisted out of the loops: set lookups avoid rescanning the term ID lists
    # for every disease annotation of every relation.
    atheroTermIDs = set(elemTerms)
    cvTermIDs = set(cvTerms)

    fullParts = [miRNAPART.MATURE, miRNAPART.ID, miRNAPART.PRECURSOR, miRNAPART.MATURE_SEQS, miRNAPART.ARM]
    baseParts = [miRNAPART.MATURE, miRNAPART.ID, miRNAPART.PRECURSOR]

    interactionsWithDisease = set()

    interactionsWithAthero = set()
    atheroPubmeds = set()
    cvPubmeds = set()
    allPubmeds = set()

    interactionsWithCV = set()
    totalMirnas = set()
    totalMirnasFull = set()
    totalGenes = set()

    totalInteractions = set()

    # (assocInt, assocCat) -> set of (gene, miRNA) interactions
    interactionByRegDir = defaultdict(set)

    for rdb in relDBs:

        for gene in rdb.ltype2rel:

            for rel in rdb.ltype2rel[gene]:

                # A truthy filter result means "drop this relation".
                if filter is not None and filter(rel):
                    continue

                mirObj = miRNA(rel.rid)

                totalMirnasFull.add(mirObj.getStringFromParts(fullParts))

                baseMirna = mirObj.getStringFromParts(baseParts)
                intTuple = (rel.lid, baseMirna)

                totalInteractions.add(intTuple)

                interactionByRegDir[(rel.assocInt, rel.assocCat)].add(intTuple)

                totalMirnas.add(intTuple[1])
                totalGenes.add(intTuple[0])

                docID = rel.docid
                retVal = pmid2disease.getDOC(docID)

                allPubmeds.add(docID)

                if retVal is None:
                    # No disease entry for this evidence document.
                    continue

                interactionsWithDisease.add(intTuple)

                for docDisease in retVal:
                    termID = docDisease['termid']

                    if termID in atheroTermIDs:
                        interactionsWithAthero.add(intTuple)
                        atheroPubmeds.add(docID)

                    if termID in cvTermIDs:
                        interactionsWithCV.add(intTuple)
                        cvPubmeds.add(docID)

    print("Different miRNAs", len(totalMirnas))
    print("Different miRNAs (full)", len(totalMirnasFull))
    print("Different genes", len(totalGenes))

    print("total Interactions", len(totalInteractions))
    print("total mirnas in interactions", len({x[1] for x in totalInteractions}))

    print("total interactions with disease", len(interactionsWithDisease))
    print("total mirnas in interactions with disease", len({x[1] for x in interactionsWithDisease}))

    atheroMirnas = {x[1] for x in interactionsWithAthero}
    atheroGenes = {x[0] for x in interactionsWithAthero}

    print("total interactions with athero", len(interactionsWithAthero))
    print("total mirnas in interactions with athero", len(atheroMirnas))
    print("total genes in interactions with athero", len(atheroGenes))
    print("total pubmeds for interactions with athero", len(atheroPubmeds))
    print("total pubmeds for interactions with cv", len(cvPubmeds))
    print("total pubmeds for interactions", len(allPubmeds))

    print("total interactions with cv", len(interactionsWithCV))
    print("total mirnas in interactions with cv", len({x[1] for x in interactionsWithCV}))

    print()

    # One interaction can appear under several (assocInt, assocCat) keys, so
    # these counts need not add up to the number of total interactions.
    for x in sorted(interactionByRegDir):
        print(x, len(interactionByRegDir[x]))

In [26]:
# Unfiltered run: no relation is skipped.
calculateStatistics()

Different miRNAs 1121
Different miRNAs (full) 1498
Different genes 7176
total Interactions 58814
total mirnas in interactions 1121
total interactions with disease 47466
total mirnas in interactions with disease 1024
total interactions with athero 1161
total mirnas in interactions with athero 153
total genes in interactions with athero 491
total pubmeds for interactions with athero 412
total pubmeds for interactions with cv 2655
total pubmeds for interactions 36251
total interactions with cv 6173
total mirnas in interactions with cv 420

('GENE_MIR', 'DOWN') 7056
('GENE_MIR', 'NEU') 12263
('GENE_MIR', 'UP') 6816
('MIR_GENE', 'CHNAGE') 5
('MIR_GENE', 'DOWN') 25139
('MIR_GENE', 'NEU') 36647
('MIR_GENE', 'UP') 12461


In [27]:
def filterFunc(rel):
    """Exclusion predicate: report whether a relation lacks an 'mmu'/'hsa' organism tag.

    Args:
        rel: Object with an ``orgs`` attribute that is either ``None`` or a
            container supporting ``in``.

    Returns:
        True when ``rel.orgs`` is ``None`` or contains neither 'mmu' nor 'hsa';
        False when at least one of the two is present.
    """
    relOrgs = rel.orgs

    # A relation without organism information cannot match either organism.
    if relOrgs is None:
        return True

    return not ('mmu' in relOrgs or 'hsa' in relOrgs)

# Restricted run: relations for which filterFunc returns True are skipped.
calculateStatistics(filterFunc)

Different miRNAs 881
Different miRNAs (full) 1143
Different genes 5180
total Interactions 31097
total mirnas in interactions 881
total interactions with disease 24580
total mirnas in interactions with disease 796
total interactions with athero 702
total mirnas in interactions with athero 122
total genes in interactions with athero 338
total pubmeds for interactions with athero 243
total pubmeds for interactions with cv 1207
total pubmeds for interactions 16259
total interactions with cv 3059
total mirnas in interactions with cv 302

('GENE_MIR', 'DOWN') 3472
('GENE_MIR', 'NEU') 5553
('GENE_MIR', 'UP') 3347
('MIR_GENE', 'DOWN') 13540
('MIR_GENE', 'NEU') 18373
('MIR_GENE', 'UP') 6542
