In [1]:
import gzip
import os, sys
import shutil
import urllib
import urllib.request
import zipfile
from collections import defaultdict, Counter

import networkx as nx
import numpy as np
import pandas as pd
from goatools.anno.gaf_reader import GafReader
from goatools.obo_parser import GODag

# Directory holding the downloaded input files; the path is relative, so it
# resolves against the notebook's working directory.
data_dir = "../data/"

In [2]:
# Print the working directory; the relative data paths used in this notebook
# (e.g. "../data/") resolve against it.
! pwd

/usr/local/hdd3/mikg/scripts


In [3]:
def download_and_unzip(download_url_link, dir_path, zipped_filename,destination_dir_name, unzip=True, force_zip=False, force_gz=False):
    """Download an archive and optionally unpack it.

    Adapted from
    https://www.tutorialsbuddy.com/download-and-unzip-a-zipped-file-in-python

    Parameters
    ----------
    download_url_link : str
        URL handed to ``urllib.request.urlretrieve``.
    dir_path : str
        Base directory. The archive is stored at ``dir_path/zipped_filename``.
    zipped_filename : str
        File name (or relative path) of the downloaded archive. Its suffix
        (``.zip`` / ``.gz``) selects the archive format unless overridden by
        ``force_zip`` / ``force_gz``.
    destination_dir_name : str
        Directory, relative to ``dir_path``, that receives the unpacked data.
    unzip : bool, default True
        When False the archive is only downloaded.
    force_zip, force_gz : bool, default False
        Treat the archive as zip / gzip regardless of its suffix. ``force_zip``
        wins when both are set.

    Raises
    ------
    NotImplementedError
        If unpacking is requested but the format is neither zip nor gzip.
    """
    archive_path = os.path.join(dir_path, zipped_filename)
    # Make sure the download target directory exists before writing into it.
    os.makedirs(os.path.dirname(archive_path) or ".", exist_ok=True)

    print("Download starting")
    urllib.request.urlretrieve(download_url_link, archive_path)
    print("Download complete")

    if not unzip:
        # Nothing to unpack, so do not report an unzip step that never ran.
        return

    print("unzipping file starting")
    destination_dir = os.path.join(dir_path, destination_dir_name)

    if (zipped_filename.endswith(".zip") and not force_gz) or force_zip:
        with zipfile.ZipFile(archive_path, "r") as zip_file:
            zip_file.extractall(destination_dir)
    elif (zipped_filename.endswith(".gz") and not force_zip) or force_gz:
        print("gzfile")
        # Strip only the trailing ".gz"; a plain str.replace would also remove
        # any ".gz" that happens to occur in the middle of the file name.
        member_name = os.path.basename(zipped_filename)
        if member_name.endswith(".gz"):
            member_name = member_name[:-len(".gz")]
        destname = os.path.join(destination_dir, member_name)
        print(destname)

        os.makedirs(destination_dir, exist_ok=True)
        # Stream the decompressed bytes instead of loading the whole payload
        # into memory at once.
        with gzip.open(archive_path, "rb") as gz_file, open(destname, "wb") as fout:
            shutil.copyfileobj(gz_file, fout)
    else:
        raise NotImplementedError("NO CASE")

    print("unzipping complete")

In [4]:
# Where the NPInter v5 interaction table comes from and where it lives locally.
npinter_url = "http://bigdata.ibp.ac.cn/npinter5/download/file/interaction_NPInterv5.txt.gz"
npinter_archive = os.path.join(data_dir, "interaction_NPInterv5.txt.gz")
npinterFile = os.path.join(data_dir, "interaction_NPInterv5.txt")

# Download and unpack only when the unpacked table is not on disk yet.
needs_download = not os.path.exists(npinterFile)
if needs_download:
    # force_zip=True makes the helper open the download with zipfile despite
    # its ".gz" suffix.
    # NOTE(review): presumably the server ships a zip archive under a .gz name
    # (the recorded run never printed the helper's "gzfile" marker) - confirm.
    download_and_unzip(
        download_url_link=npinter_url,
        dir_path=".",
        zipped_filename=npinter_archive,
        destination_dir_name=data_dir,
        force_zip=True,
    )

Download starting
Download complete
unzipping file starting
unzipping complete


In [5]:
# Peek at the first lines of the unpacked table to check its header row and
# tab-separated layout before parsing it.
! head "../data/interaction_NPInterv5.txt"

interID	ncName	ncID	ncType	tarName	tarID	tarType	interDescription	experiment	reference	organism	tissueOrCell	tag	class	level	datasource
ncRI-40000001	4.5S	NONOTHNOBEDT000074	lncRNA	rpoD	P0A6M8	protein	Following translocation, 4.5S RNA replaces 23S rRNA as a binding site for EF-G. This replacement promotes the dissociation of EF-G from the ribosome. GDP has positive effects on the RNA binding.The conserved decanucleotide sequence (5-GAAGCAGCCA-3) of 4.5S RNA, competed with the 23S rRNA region defining the EF-G-binding site.	Immunoprecipitation experiments;EMSA;SDS-polyacrylamide gel electrophoresis	8662727;8107852;1281314	Escherichia coli	-	ncRNA-protein binding	binding	RNA-Protein	Literature mining
ncRI-40000002	6S	NONOTHNOBEDT000078	lncRNA	rpoB	M1FYN7	protein	UV crosslinking experiments revealed that 6S RNA contacts the sigma70 and beta/beta' subunits.	cross-linking	10892648	Escherichia coli	-	ncRNA-protein binding	binding	RNA-Protein	Literature mining
ncRI-40000003	6S	NONOTHNOBEDT000

In [6]:
# Parse the tab-separated NPInter table and keep an independent copy of the
# rows annotated as human.
# NOTE(review): the recorded output shows pandas emitting a warning on this
# read - presumably a DtypeWarning about mixed column types; confirm and
# consider passing explicit dtypes.
df = (
    pd.read_csv("../data/interaction_NPInterv5.txt", sep="\t")
    .loc[lambda table: table["organism"] == "Homo sapiens"]
    .copy()
)

  df = pd.read_csv("../data/interaction_NPInterv5.txt", sep="\t")


In [7]:
# (rows, columns) of the table after restricting it to "Homo sapiens".
df.shape

(1423235, 16)

In [8]:
# Preview the first rows of the filtered table.
df.head()

Unnamed: 0,interID,ncName,ncID,ncType,tarName,tarID,tarType,interDescription,experiment,reference,organism,tissueOrCell,tag,class,level,datasource
3,ncRI-40000004,7SK,NONHSAG040596,lncRNA,MYC,NM_002467,protein,Electrophoretic mobility shift experiment indi...,EMSA,9018369,Homo sapiens,-,regulatory;promoter as action site,regulatory,RNA-RNA,Literature mining
4,ncRI-40000005,7SK,NONHSAG040596,lncRNA,ABO,NM_020469,protein,Promoters pU6 and p7SK proved to express high ...,-,22522162,Homo sapiens,-,regulatory,regulatory,RNA-RNA,Literature mining
5,ncRI-40000006,7SK,NONHSAG040596,lncRNA,HEXIM1,O94992,protein,Binding of the 7SK snRNA turns the HEXIM1 prot...,Yeast two-hybrid analysis;Immunofluorescence;M...,15201869;15994294;17671421;20675720;20926576;1...,Homo sapiens,-,ncRNA-protein binding,binding,RNA-Protein,Literature mining
6,ncRI-40000007,7SK,NONHSAG040596,lncRNA,Oct1,P20263,protein,PTF and Oct-1 enhance transcription from the 7...,-,1535687,Homo sapiens,-,ncRNA is regulated,regulatory,DNA-TF,Literature mining
7,ncRI-40000008,7SK,NONHSAG040596,lncRNA,Tat,P17735,protein,Tat efficiently replaces HEXIM1 on the 7SK snR...,-,20976203;20471949,Homo sapiens,-,ncRNA-protein binding,binding,RNA-Protein,Literature mining


In [9]:
# Spelling variants of the "class" column and the canonical label each one
# should be collapsed to.
colMap = {
    'binding; regulatory': 'binding;regulatory',
    'binding;': "binding"
}

# Every other label found in the data maps to itself, so the dictionary covers
# all observed values of the column.
colMap.update({label: label for label in set(df["class"]) if label not in colMap})
colMap

{'binding; regulatory': 'binding;regulatory',
 'binding;': 'binding',
 'binding': 'binding',
 'expression correlation': 'expression correlation',
 '-': '-',
 'binding;regulatory': 'binding;regulatory',
 'coexpression': 'coexpression',
 'regulatory': 'regulatory'}

In [10]:
# Rewrite the "class" column through colMap, collapsing the spelling variants
# onto their canonical labels.
df["class"]=df["class"].map(colMap)

In [11]:
# Frequency of each interaction class after normalisation.
Counter(df["class"])

Counter({'regulatory': 5896,
         'binding': 1297058,
         'coexpression': 60,
         'expression correlation': 124,
         'binding;regulatory': 120094,
         '-': 3})

In [12]:
# Frequency of each target biotype label (tarType column).
Counter(df["tarType"])

Counter({'protein': 516016,
         'miRNA': 123162,
         'lncRNA': 8414,
         'mRNA': 765436,
         'DNA': 3,
         'snRNA': 1613,
         'Protein': 2,
         'circRNA': 283,
         'ncRNA': 4678,
         'pseudogene': 3455,
         'snoRNA': 170,
         'Pseudogene': 1,
         'TF': 2})

In [13]:
# Frequency of each biotype label on the ncRNA side (ncType column).
Counter(df["ncType"])

Counter({'lncRNA': 643173,
         'miRNA': 756147,
         'ncRNA': 6669,
         'circRNA': 987,
         'vtRNAs': 1,
         'snoRNA': 215,
         'snRNA': 1229,
         'mRNA': 11410,
         'protein': 21,
         'pseudogene': 3383})

In [14]:
# NPInter biotype labels (ncType / tarType columns), grouped by the node type
# they are assigned in the knowledge graph. Capitalised variants are listed
# explicitly because lookups are case-sensitive.
_ncrna_biotypes = (
    'miRNA', 'lncRNA', 'snoRNA', 'snRNA', 'ncRNA',
    'circRNA', 'sRNA', 'vtRNAs', 'piRNAs',
)
_gene_biotypes = (
    'mRNA', 'pseudogene', 'Pseudogene', 'protein', 'Protein', "TF",
)

# Biotype label -> knowledge-graph node type. "DNA" deliberately maps to None,
# i.e. it has no node type.
type2kgtype = {
    **dict.fromkeys(_ncrna_biotypes, "ncRNA"),
    **dict.fromkeys(_gene_biotypes, "gene"),
    "DNA": None,
}

In [15]:
# Inspect the rows whose target is labelled "DNA" - the one biotype that
# type2kgtype maps to None.
df[df["tarType"] == "DNA"]

Unnamed: 0,interID,ncName,ncID,ncType,tarName,tarID,tarType,interDescription,experiment,reference,organism,tissueOrCell,tag,class,level,datasource
969296,ncRI-40969297,AC026904.1,ENSG00000233858,lncRNA,the promoter region of SLUG,-,DNA,these results supported the notion that AC0269...,Chromatin isolation by RNA purification (ChIRP...,29774079,Homo sapiens,MDA-MB-231,regulatory,binding,RNA-DNA,Literature Mining
969402,ncRI-40969403,SNHG1,NONHSAG008552,lncRNA,SNHG1 locus,-,DNA,The existence of these regulatory signals indi...,chromatin isolation by RNA purification (ChIRP...,28825722,Homo sapiens,HCT116,regulatory,regulatory,RNA-DNA,Literature Mining
969724,ncRI-40969725,HOTAIR,NONHSAG011264,lncRNA,promoter of miR-34a,-,DNA,EZH2 coupled with HOTAIR to inhibit miR-34a in...,ChIRP assay,27594424,Homo sapiens,SW1990,promoter as action site;regulatory,binding,RNA-DNA,Literature Mining


In [16]:
# Build a directed knowledge graph from the filtered NPInter rows: one node per
# interactor *name* and one "interacts" edge pointing from the ncRNA-side entry
# (ncName) to its target (tarName).
kg = nx.DiGraph()
added = 0

# Columns consumed by the loop, in the order they are unpacked below.
edge_columns = ["ncName", "ncID", "ncType", "tarName", "tarID", "tarType", "class", "interID"]

# itertuples(name=None) yields plain tuples (index label first). Unlike
# iterrows it does not build a Series per row, which matters for a table with
# more than a million rows.
for ri, ncName, ncID, ncType, tarName, tarID, tarType, intClass, intID in df[edge_columns].itertuples(name=None):

    # Missing identifiers fall back to the (unmodified) name.
    if pd.isna(ncID):
        ncID = ncName

    if pd.isna(tarID):
        tarID = tarName

    # Drop the "hsa-" species prefix from the ncRNA-side name.
    if ncName.startswith("hsa-"):
        ncName = ncName[4:]

    # Unknown biotype labels raise KeyError here rather than being skipped.
    kgNcType = type2kgtype[ncType]
    kgTarType = type2kgtype[tarType]

    # Keep only interactions that involve at least one "gene" node ...
    if not "gene" in [kgNcType, kgTarType]:
        continue

    # ... and whose partners both have a node type (None marks e.g. "DNA").
    if None in [kgNcType, kgTarType]:
        continue

    # Progress report each time the counter reaches a multiple of 100000.
    if added % 100000 == 0:
        print(ri, ncName, ncID, ncType, tarName, tarID, tarType, intClass, intID)

    # Nodes are keyed by name; the attributes of the first occurrence are kept.
    if not ncName in kg.nodes:
        kg.add_node(ncName, id=ncID, name=ncName, type=kgNcType, biotype=ncType, score=0)

    if not tarName in kg.nodes:
        kg.add_node(tarName, id=tarID, name=tarName, type=kgTarType, biotype=tarType, score=0)

    # A DiGraph holds a single edge per (source, target) pair, so a repeated
    # pair replaces the attributes stored by the earlier row.
    kg.add_edge(ncName, tarName, type="interacts", score=0, source="npinter5", source_type=intClass, source_id=intID)

    # Counts processed rows, which can exceed the number of distinct edges.
    added += 1

print("added", added, "entries")

3 7SK NONHSAG040596 lncRNA MYC NM_002467 protein regulatory ncRI-40000004
126456 DNAJA1 NONHSAG052013 lncRNA TARBP2 Q15633 protein binding ncRI-40126457
423774 RP11-346I3.4 NONHSAG099517 lncRNA CSTF2T Q9H0L4 protein binding ncRI-40423775
523774 CTB-152G17.6 NONHSAG097241 lncRNA HLTF Q14527 protein binding ncRI-40523775
639532 miR-16-1-3p MI0000070 miRNA OAZ1 ENSG00000104904 mRNA binding ncRI-40639533
739750 miR-4323 MI0015853 miRNA OTUD7B ENSG00000163113 mRNA binding ncRI-40739751
840540 miR-641 MI0003656 miRNA GORASP2 ENSG00000115806 mRNA binding ncRI-40840541
959280 miR-590-3p MI0003602 miRNA TMEM87A ENSG00000103978 mRNA binding ncRI-40959281
1177748 AC016168.2 ENSG00000267568 lncRNA IGF2BP1 Q9NZI8 protein binding ncRI-50076504
1295934 miR-30b-5p MI0000441 miRNA SMARCA5 ENSG00000153147 mRNA binding ncRI-50193818
1395934 miR-449a MI0001648 miRNA EIF5 ENSG00000100664 mRNA binding ncRI-50293818
1495934 miR-188-3p MI0000484 miRNA TUSC2 ENSG00000114383 mRNA binding ncRI-50393818
1595934 m

In [17]:
# Tally the knowledge-graph nodes by their "type" attribute.
Counter(attrs["type"] for _, attrs in kg.nodes(data=True))

Counter({'ncRNA': 56739, 'gene': 12581})