In [1]:
%load_ext autoreload
%autoreload 2

import os, sys
import numpy
from collections import defaultdict, Counter
import datetime


In [2]:
# Put the parent of the current working directory at the front of sys.path so
# that imports are resolved against that folder first.
print(os.getcwd())
_projectRoot = str(os.path.dirname(os.path.realpath(os.getcwd())))
# Drop any earlier copy before inserting: re-running this cell must not stack
# duplicate entries on sys.path, and the root has to stay in first position.
sys.path[:] = [entry for entry in sys.path if entry != _projectRoot]
sys.path.insert(0, _projectRoot)
print(sys.path)

/mnt/f/dev/git/miRExplore/python/nbs
['/mnt/f/dev/git/miRExplore/python', '/mnt/f/dev/git/miRExplore/python/nbs', '/usr/lib/python38.zip', '/usr/lib/python3.8', '/usr/lib/python3.8/lib-dynload', '', '/usr/local/lib/python3.8/dist-packages', '/usr/lib/python3/dist-packages', '/usr/local/lib/python3.8/dist-packages/IPython/extensions', '/home/mjoppich/.ipython']


In [3]:
%autoreload 2
from textdb.PubmedDateDB import PubmedDateDB
%autoreload 2
from synonymes.mirnaID import miRNA, miRNAPART
%autoreload 2
from textdb.MiGenRelDB import MiGenRelDB,MiRGeneRel
%autoreload 2
from synonymes.GeneOntology import GeneOntology
%autoreload 2
from textdb.PMID2XDB import PMID2XDB


In [4]:
# Input locations used by the cells below.
#   pmidBase: folder holding the *.pmid files (relations, disease annotations)
#   obodir:   folder holding the ontology (.obo) files
pmidBase = "/mnt/f/dev/data/pmid_jun2020/aggregated_pmid/"
obodir = "/mnt/f/dev/data/pmid_jun2020/obodir/"

In [6]:
# Build the gene-symbol normalisation data from the HGNC synonym file; the
# result is passed to the relation loaders as `normGeneSymbols`.
# NOTE(review): the path is hard-coded and points to a different obodir than
# the `obodir` variable of this notebook — confirm that this is intended.
from utils.tmutils import normalize_gene_names
normGeneSymbols = normalize_gene_names(path="/mnt/d/owncloud/data/miRExplore/obodir/" + "/hgnc_no_withdrawn.syn")

In [7]:
# Load the miRNA-gene relation files for hsa and mmu. Both loads use the same
# options, so the keyword arguments are collected once.
_relLoadOptions = dict(
    ltype="mirna",
    rtype="gene",
    normGeneSymbols=normGeneSymbols,
    switchLR=True,
    stopAfter=-1,
)

print("Loading hsa")
mirelPMIDhsa = MiGenRelDB.loadFromFile(pmidBase + "/mirna_gene.hsa.pmid", **_relLoadOptions)

print("Loading mmu")
mirelPMIDmmu = MiGenRelDB.loadFromFile(pmidBase + "/mirna_gene.mmu.pmid", **_relLoadOptions)

# Later cells iterate over this list rather than over the individual databases.
relDBs = [mirelPMIDhsa, mirelPMIDmmu]

Loading hsa
Gene Symbols Normalized 77476
Loaded file /mnt/f/dev/data/pmid_jun2020/aggregated_pmid//mirna_gene.hsa.pmid
Accepted Doc IDs 40682
Rejected Doc IDs 0
Seen genes 7906
Seen miRNAs 2116
Seen Harm. miRNAs 1615
Loading mmu
Gene Symbols Normalized 12375
Loaded file /mnt/f/dev/data/pmid_jun2020/aggregated_pmid//mirna_gene.mmu.pmid
Accepted Doc IDs 36823
Rejected Doc IDs 0
Seen genes 6431
Seen miRNAs 2048
Seen Harm. miRNAs 1578


In [8]:
# Union of the evidence document ids reported by every relation database.
miRNAGenePMIDs = set()
for rdb in relDBs:
    miRNAGenePMIDs.update(rdb.get_evidence_docids())

In [9]:
# Number of distinct evidence document ids held in miRNAGenePMIDs.
len(miRNAGenePMIDs)

35699

In [10]:
# Load the disease ontology (doid.obo) from the ontology folder.
diseaseObo = GeneOntology(obodir + "/doid.obo")

# Ontology terms of interest (as they appear in the disease annotations):
#{'group': 'disease', 'termid': 'DOID:1936', 'name': 'atherosclerosis'}
#{'group': 'disease', 'termid': 'DOID:2349', 'name': 'arteriosclerosis'}
#{'group': 'disease', 'termid': 'DOID:1287', 'name': 'cardiovascular system disease'},
# elemTerms: ids of everything getAllChildren() returns for DOID:1936, plus DOID:1936 itself.
elemTerm = diseaseObo['DOID:1936']
elemTerms = [x.term.id for x in elemTerm.getAllChildren()] + [elemTerm.id]

# cvTerms: the same for DOID:1287, with all elemTerms ids appended as well
# (the list is not de-duplicated, so an id may occur more than once).
cvTerm = diseaseObo['DOID:1287']
cvTerms = [x.term.id for x in cvTerm.getAllChildren()] + [cvTerm.id] + elemTerms

# Disease annotations per document. miRNAGenePMIDs is passed as third argument
# — presumably to restrict loading to documents with a relation; TODO confirm.
pmid2disease = PMID2XDB.loadFromFile(pmidBase + "/disease.pmid", diseaseObo, miRNAGenePMIDs)

0
10000
no parent 2382


In [11]:
# Distinct left-hand identifiers over all relation databases, reported as the
# number of genes with an interaction.
allGenes = set()
for rdb in relDBs:
    allGenes.update(rdb.all_ltypes)

print("Number of genes with interaction", len(allGenes))

Number of genes with interaction 7105


In [12]:
# Number of distinct miRNAs with an interaction. Every right-hand identifier is
# reduced to its MATURE/ID/PRECURSOR parts first (i.e. restricted to miR-x).
allMirnas = set()
for rdb in relDBs:
    for mirna in rdb.all_rtypes:
        try:
            mirObj = miRNA(mirna)
            allMirnas.add(mirObj.getStringFromParts([miRNAPART.MATURE, miRNAPART.ID, miRNAPART.PRECURSOR]))
        except Exception as err:
            # Show the offending name, then fail this cell with the original
            # error chained. Catching Exception (not a bare except) lets
            # KeyboardInterrupt through, and raising instead of calling exit()
            # keeps the session and everything loaded so far alive.
            print(mirna)
            raise ValueError("Could not process miRNA name {!r}".format(mirna)) from err

print("Number of mirnas with interaction", len(allMirnas))
print("DO NOT USE THIS NUMBER!!!!")

Number of mirnas with interaction 1119
DO NOT USE THIS NUMBER!!!!


In [13]:
# Peek at the first nine right-hand identifiers of each database together with
# their Python type.
for rdb in relDBs:
    for midx, mirna in enumerate(rdb.all_rtypes):
        print(mirna, type(mirna))

        # midx is zero-based: stop once the ninth entry has been printed.
        if midx >= 8:
            break

miR-591 <class 'str'>
mmu-miR-107 <class 'str'>
hsa-miR-338 <class 'str'>
miR-145a <class 'str'>
miR-320 <class 'str'>
hsa-miR-661 <class 'str'>
miR-605 <class 'str'>
miR-516a <class 'str'>
hsa-miR-377 <class 'str'>
miR-591 <class 'str'>
hsa-miR-338 <class 'str'>
miR-145a <class 'str'>
miR-320 <class 'str'>
hsa-miR-661 <class 'str'>
miR-605 <class 'str'>
miR-516a <class 'str'>
hsa-miR-377 <class 'str'>
hsa-miR-1537 <class 'str'>


In [14]:
# Count distinct miRNA names over all relations at two levels of detail:
# "short" keeps the MATURE/ID/PRECURSOR parts, "full" additionally keeps the
# MATURE_SEQS and ARM parts.
seenFullMirs = set()
seenShortMirs = set()

for rdb in relDBs:
    for gene in rdb.ltype2rel:
        for rel in rdb.ltype2rel[gene]:
            mirstr = rel.orig_names[1]

            # Sanity check: the second original name must equal the relation's rid.
            assert mirstr == rel.rid

            omir = miRNA(mirstr)
            seenFullMirs.add(omir.getStringFromParts([miRNAPART.MATURE, miRNAPART.ID, miRNAPART.PRECURSOR, miRNAPART.MATURE_SEQS, miRNAPART.ARM]))
            seenShortMirs.add(omir.getStringFromParts([miRNAPART.MATURE, miRNAPART.ID, miRNAPART.PRECURSOR]))

print("Short miRNAs", len(seenShortMirs))
print("Full miRNAs", len(seenFullMirs))

Short miRNAs 1119
Full miRNAs 1494


In [15]:


def calculateStatistics(filter=None):
    """Print summary counts for the relations held in the global ``relDBs``.

    Every relation of every database is visited once. For each relation the
    pair ``(rel.lid, reduced miRNA name)`` is recorded as an interaction. The
    disease annotations that the global ``pmid2disease`` returns for the
    relation's document decide whether the pair is also counted as a disease
    interaction, an "athero" interaction (term id in the global ``elemTerms``)
    and/or a "cv" interaction (term id in the global ``cvTerms``).

    Args:
        filter: optional callable that receives a relation. Relations for which
            it returns a truthy value are SKIPPED, i.e. it is an exclusion
            predicate. With ``None`` (the default) every relation is used.

    Returns:
        None. All counts are written to stdout.
    """

    # The term-id globals are lists and are probed once per disease annotation
    # of every relation; sets give the same answers without the linear scan.
    atheroTermIds = set(elemTerms)
    cvTermIds = set(cvTerms)

    interactionsWithDisease = set()

    interactionsWithAthero = set()
    atheroPubmeds = set()
    cvPubmeds = set()
    allPubmeds = set()

    interactionsWithCV = set()
    totalMirnas = set()
    totalMirnasFull = set()
    totalGenes = set()

    totalInteractions = set()

    # (assocInt, assocCat) -> interactions reported with that combination
    interactionByRegDir = defaultdict(set)

    for rdb in relDBs:

        for gene in rdb.ltype2rel:

            for rel in rdb.ltype2rel[gene]:

                # A truthy filter result excludes the relation.
                if filter is not None and filter(rel):
                    continue

                mirObj = miRNA(rel.rid)

                fullMirna = mirObj.getStringFromParts([miRNAPART.MATURE, miRNAPART.ID, miRNAPART.PRECURSOR, miRNAPART.MATURE_SEQS, miRNAPART.ARM])
                totalMirnasFull.add(fullMirna)

                # Interactions are keyed on the reduced miRNA name, so several
                # full names can collapse into the same interaction.
                baseMirna = mirObj.getStringFromParts([miRNAPART.MATURE, miRNAPART.ID, miRNAPART.PRECURSOR])
                intTuple = (rel.lid, baseMirna)

                totalInteractions.add(intTuple)
                interactionByRegDir[(rel.assocInt, rel.assocCat)].add(intTuple)

                totalMirnas.add(intTuple[1])
                totalGenes.add(intTuple[0])

                docID = rel.docid
                retVal = pmid2disease.getDOC(docID)

                allPubmeds.add(docID)

                # No disease annotation for this document: nothing more to count.
                if retVal is None:
                    continue

                interactionsWithDisease.add(intTuple)

                for docDisease in retVal:

                    termId = docDisease['termid']

                    if termId in atheroTermIds:
                        interactionsWithAthero.add(intTuple)
                        atheroPubmeds.add(docID)

                    if termId in cvTermIds:
                        interactionsWithCV.add(intTuple)
                        cvPubmeds.add(docID)

    print("Different miRNAs", len(totalMirnas))
    print("Different miRNAs (full)", len(totalMirnasFull))
    print("Different genes", len(totalGenes))

    print("total Interactions", len(totalInteractions))
    print("total mirnas in interactions", len({x[1] for x in totalInteractions}))

    print("total interactions with disease", len(interactionsWithDisease))
    print("total mirnas in interactions with disease", len({x[1] for x in interactionsWithDisease}))

    atheroMirnas = {x[1] for x in interactionsWithAthero}
    atheroGenes = {x[0] for x in interactionsWithAthero}

    print("total interactions with athero", len(interactionsWithAthero))
    print("total mirnas in interactions with athero", len(atheroMirnas))
    print("total genes in interactions with athero", len(atheroGenes))
    print("total pubmeds for interactions with athero", len(atheroPubmeds))
    print("total pubmeds for interactions with cv", len(cvPubmeds))
    print("total pubmeds for interactions", len(allPubmeds))

    print("total interactions with cv", len(interactionsWithCV))
    print("total mirnas in interactions with cv", len({x[1] for x in interactionsWithCV}))

    print()

    # One line per (assocInt, assocCat) combination, in sorted order.
    for x in sorted(interactionByRegDir):
        print(x, len(interactionByRegDir[x]))

In [16]:
# Statistics over every relation (no filter).
calculateStatistics()

Different miRNAs 1119
Different miRNAs (full) 1494
Different genes 7105
total Interactions 56369
total mirnas in interactions 1119
total interactions with disease 45599
total mirnas in interactions with disease 1022
total interactions with athero 1091
total mirnas in interactions with athero 156
total genes in interactions with athero 482
total pubmeds for interactions with athero 409
total pubmeds for interactions with cv 2657
total pubmeds for interactions 35699
total interactions with cv 6046
total mirnas in interactions with cv 431

('GENE_MIR', 'DOWN') 6807
('GENE_MIR', 'NEU') 11892
('GENE_MIR', 'UP') 6657
('MIR_GENE', 'CHNAGE') 5
('MIR_GENE', 'DOWN') 23789
('MIR_GENE', 'NEU') 34992
('MIR_GENE', 'UP') 11511


In [17]:
def filterFunc(rel):
    """Exclusion predicate for ``calculateStatistics``.

    Args:
        rel: relation object; only its ``orgs`` attribute is read. ``None`` is
            treated like an empty collection.

    Returns:
        True (relation is skipped) when ``rel.orgs`` contains neither 'mmu'
        nor 'hsa'; False (relation is kept) otherwise.
    """
    relOrgs = rel.orgs
    if relOrgs is None:
        relOrgs = set()

    return 'mmu' not in relOrgs and 'hsa' not in relOrgs

# Same statistics, restricted to relations whose orgs contain 'mmu' or 'hsa'
# (filterFunc returns True for everything else, which excludes it).
calculateStatistics(filterFunc)

Different miRNAs 879
Different miRNAs (full) 1140
Different genes 5120
total Interactions 29672
total mirnas in interactions 879
total interactions with disease 23464
total mirnas in interactions with disease 793
total interactions with athero 664
total mirnas in interactions with athero 123
total genes in interactions with athero 327
total pubmeds for interactions with athero 243
total pubmeds for interactions with cv 1207
total pubmeds for interactions 16024
total interactions with cv 2942
total mirnas in interactions with cv 309

('GENE_MIR', 'DOWN') 3360
('GENE_MIR', 'NEU') 5348
('GENE_MIR', 'UP') 3269
('MIR_GENE', 'DOWN') 12803
('MIR_GENE', 'NEU') 17520
('MIR_GENE', 'UP') 6006
