This notebook contains code related to downloading the dataset, and converting it into a list of tuples $(h,r,t)$ (corresponding to head, relation, tail). The list is stored in `data/dataset.tsv`. Next to it, we store lookup tables for the types of targets and regulators. (`data/target2type.csv` and `data/regulator2type.csv` respectively.)

In [None]:
!pip install gffutils

In [138]:
import os
from dataset import load_lnctard, df2nx
import pandas as pd
import gffutils
from tqdm import tqdm
import ast

### 🛒  Download and load dataset

In [2]:
!mkdir -p data
!wget https://lnctard.bio-database.com/downloadfile/lnctard2.0.zip -qO- | zcat > data/lnctard2.0.txt

load raw dataset, for some reason `utf-8` does not work for decoding, but `latin-1` seems to work

In [170]:
df = load_lnctard()

In [3]:
df.head()

Unnamed: 0,Regulator,Target,SearchregulatoryMechanism,RegulatorType,TargetType
0,LINC00313,miR-4429,ceRNA or sponge,lncRNA,miRNA
1,FAM83H-AS1,CDKN1A,epigenetic regulation,lncRNA,PCG
2,NEAT1,TGFB1,ceRNA or sponge,lncRNA,PCG
3,NEAT1,ZEB1,ceRNA or sponge,lncRNA,TF
4,ZFPM2-AS1,MIF,interact with protein,lncRNA,PCG


extract largest graph connection component

In [5]:
largest_cc = df2nx(
  df, head="Regulator", tail="Target",
  relation="SearchregulatoryMechanism",
  cc_mode="largest",
)

### 🛍️ Extract tuples and store dataset

In [6]:
# create tuples (h,r,t)
edgedata = largest_cc.edges.data("SearchregulatoryMechanism")
tuples = [(h,r,t) for h,t,r in edgedata] # swizzle t and r
tuples = pd.DataFrame(tuples, columns=["head","relation","tail"])
print("gathered",len(tuples),"tuples")
tuples.head()

gathered 6773 tuples


Unnamed: 0,head,relation,tail
0,LINC00313,ceRNA or sponge,miR-4429
1,LINC00313,transcriptional regulation,SOX2
2,LINC00313,ceRNA or sponge,MIR422A
3,LINC00313,ceRNA or sponge,FOSL2
4,LINC00313,epigenetic regulation,ALX4


In [7]:
tuples.to_csv("data/dataset.tsv", sep="\t", index=False)

### 🛍️ Extract and store node types from Gencode

In [None]:
!wget https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_43/gencode.v43.annotation.gff3.gz -qO- | gunzip > data/gencode.v43.annotation.gff3

In [9]:
# stolen from http://daler.github.io/gffutils/#create-the-database
db_path = "data/human.db"
if not os.path.exists(db_path):
  print("database does not exist, creating new one.")
  db = gffutils.create_db(
    "data/gencode.v43.annotation.gff3",
    dbfn=db_path,
    merge_strategy='warning', # TODO: we may want to do merge here
    verbose=True,
  )
else:
  print("loading existing database")
  db = gffutils.FeatureDB(db_path)

loading existing database


In [155]:
# extract gene names and types from database
gene_names, gene_types = [], []
for feat in tqdm(
  db.execute("select f.attributes from features f"),
  total=next(db.execute("select count(*) as c from features"))["c"],
):
  attribs = ast.literal_eval(feat["attributes"])
  gene_names += attribs["gene_name"]
  gene_types += attribs["gene_type"]

100%|███████████████████████████████████████████████████████| 2448999/2448999 [03:30<00:00, 11627.02it/s]


In [168]:
gene_df = pd.DataFrame({"gene_name": gene_names, "gene_type": gene_types}).drop_duplicates(ignore_index=True)

In [169]:
gene_df.head()

Unnamed: 0,gene_name,gene_type
0,DDX11L2,lncRNA
1,DDX11L1,transcribed_unprocessed_pseudogene
2,WASH7P,unprocessed_pseudogene
3,MIR6859-1,miRNA
4,MIR1302-2HG,lncRNA


In [187]:
target2type = (
  gene_df[gene_df["gene_name"].isin(df["Target"])]
    .reset_index(drop=True)
    .rename(columns={"gene_name": "Target", "gene_type": "TargetType"})
)
target2type.head()

Unnamed: 0,Target,TargetType
0,LINC00115,lncRNA
1,HES5,protein_coding
2,PRDM16,protein_coding
3,TP73,protein_coding
4,TP73-AS1,transcribed_unitary_pseudogene


In [188]:
regulator2type = (
  gene_df[gene_df["gene_name"].isin(df["Regulator"])]
    .reset_index(drop=True)
    .rename(columns={"gene_name": "Regulator", "gene_type": "RegulatorType"})
)
regulator2type.head()

Unnamed: 0,Regulator,RegulatorType
0,LINC00115,lncRNA
1,LINC01128,lncRNA
2,LINC01342,lncRNA
3,PRKCZ-AS1,lncRNA
4,PRDM16-DT,lncRNA


In [189]:
target2type.to_csv("data/target2type.tsv", sep="\t", index=False)
regulator2type.to_csv("data/regulator2type.tsv", sep="\t", index=False)

### 🧘 Combine target2type and regulator2type relations

In [197]:
entity2type = pd.concat([
  regulator2type.rename(columns={"Regulator":"Entity", "RegulatorType":"EntityType"}),
  target2type.rename(columns={"Target":"Entity", "TargetType":"EntityType"}),
]).drop_duplicates(ignore_index=True)
entity2type.head()

Unnamed: 0,Entity,EntityType
0,LINC00115,lncRNA
1,LINC01128,lncRNA
2,LINC01342,lncRNA
3,PRKCZ-AS1,lncRNA
4,PRDM16-DT,lncRNA


In [198]:
entity2type.to_csv("data/entity2type.tsv", sep="\t", index=False)

### Create a NBFNet compatible entity mapping

In [204]:
entity2typeidx = entity2type.copy()
entity2typeidx["EntityType"] = entity2typeidx["EntityType"].astype("category")
entity2typeidx["EntityType"] = entity2typeidx["EntityType"].cat.codes
entity2typeidx.head()

Unnamed: 0,Entity,EntityType
0,LINC00115,1
1,LINC01128,1
2,LINC01342,1
3,PRKCZ-AS1,1
4,PRDM16-DT,1


In [207]:
entity2typeidx.to_csv("data/entity2typeidx.tsv", sep=" ", index=False, header=False)

### Hack entity types

In [210]:
entity2typeidx = pd.DataFrame({
  "Entity": pd.concat([df["Target"],df["Regulator"]]).drop_duplicates(ignore_index=True),
  "EntityType": 0,
})

In [211]:
entity2typeidx

Unnamed: 0,Entity,EntityType
0,miR-4429,0
1,CDKN1A,0
2,TGFB1,0
3,ZEB1,0
4,MIF,0
...,...,...
3903,LAMTOR5-AS1,0
3904,HSPA7,0
3905,PKMYT1AR,0
3906,RP5-857K21.7,0


In [214]:
entity2typeidx.to_csv("data/entity2typeidx.tsv", sep="\t", index=False, header=False)