In [18]:
import pandas as pd
import pickle
from library2_utils.NA_sequence_utilities import reverse_complement

### This notebook preprocesses the miRBase table (normalized sequences, targets, GC content, arm-specific families) and selects the likely real miRNAs

In [None]:
def fill_in_sequence(sequence):
    """Pad a sequence at its 3' end so that it is 21 nt long.

    Sequences that already have 21 or more characters are returned
    unchanged (they are not truncated here). Shorter sequences are padded
    with "T". The one exception is a 20-nt sequence ending in "A": it is
    padded with "A" instead, so that the padding does not leave an "AT" at
    the end (the original intent is to avoid introducing ATGs).

    Parameters
    ----------
    sequence : str
        Sequence to pad.

    Returns
    -------
    str
        The padded sequence, or the input itself when no padding is needed.
    """
    target_length = 21
    missing = target_length - len(sequence)

    # nothing to add: the sequence is already long enough
    if missing <= 0:
        return sequence

    # a single missing base after a trailing "A" is filled with "A", not "T"
    if missing == 1 and sequence[-1] == "A":
        return sequence + "A"

    return sequence + "T" * missing

In [3]:
# Load the original miRBase table and derive the sequence columns in one chain.
mirbase = (
    pd.read_csv("../microrna_data/mirbase_original.csv", index_col=0)
    # write the sequence in the DNA alphabet (U -> T)
    .assign(sequence=lambda table: table["sequence"].str.replace("U", "T"))
    # normalized sequence: the first 21 nt, padded by fill_in_sequence when shorter
    .assign(
        sequence_norm=lambda table: table["sequence"].str[:21].apply(fill_in_sequence)
    )
    # keep the full-length sequence under a more explicit name
    .rename(columns={"sequence": "sequence_orig"})
)

In [6]:
# the target column holds reverse_complement(..., "DNA") of each normalized sequence
mirbase["target"] = mirbase["sequence_norm"].apply(
    lambda norm_seq: reverse_complement(norm_seq, "DNA")
)

In [7]:
# GC content = (number of G + number of C) / length, computed on the original sequence
orig_sequences = mirbase["sequence_orig"]
gc_count = orig_sequences.str.count("G") + orig_sequences.str.count("C")
mirbase["GC_content"] = gc_count / orig_sequences.str.len()

In [None]:
# Add an extra column, "family_extended", in which every family is split
# into its 5p and 3p arm.

# Fix missing family values using the miRNA name. After astype(str) a missing
# value is the literal string "nan"; for those rows the third "-"-separated
# field of the index label is used, prefixed with "mir-".
mirbase["family"] = [
    "mir-" + mirna_name.split("-")[2] if family == "nan" else family
    for mirna_name, family in zip(mirbase.index, mirbase["family"].astype(str))
]

arm_suffixes = ("5p", "3p")
mirna_records = list(
    zip(mirbase.index, mirbase["family"], mirbase["sequence_orig"])
)

# (family, seed) -> arm for every miRNA whose name ends in "5p" or "3p".
# The seed is sequence_orig[1:8]. Later rows overwrite earlier ones, so if
# family members with the same seed carry different arms, the last one in
# index order wins.
arm_by_family_seed = {}
for mirna_name, family, sequence in mirna_records:
    arm = mirna_name.split("-")[-1]
    if arm in arm_suffixes:
        arm_by_family_seed[(family, sequence[1:8])] = arm

# Build the whole column first and assign it once. This avoids reading
# "family_extended" before the column exists (a KeyError when the first
# arm-less miRNA has no seed match) and makes the cell safe to re-run.
family_extended = []
for mirna_name, family, sequence in mirna_records:
    arm = mirna_name.split("-")[-1]
    if arm not in arm_suffixes:
        # The name carries no arm: borrow the arm of a miRNA from the same
        # family with an identical seed, if there is one.
        arm = arm_by_family_seed.get((family, sequence[1:8]))
    # without any arm information the plain family name is kept
    family_extended.append(family if arm is None else family + "-" + arm)

mirbase["family_extended"] = family_extended

In [9]:
# write the extended table to CSV in the data directory
extended_csv_path = "../microrna_data/mirbase_extended.csv"
mirbase.to_csv(extended_csv_path)

# Get all likely real miRNAs (high confidence in miRBase or in MirGeneDB)

In [10]:
# read the MirGeneDB table
mirgenedb = pd.read_csv("../microrna_data/mirgenedb.csv", index_col=0)

# keep the miRBase entries whose confidence is annotated as "high"
is_high_confidence = mirbase["confidence"].eq("high")
mirbase_high_conf = mirbase.loc[is_high_confidence]

In [11]:
# collect the 5p and 3p accessions listed in MirGeneDB, skipping the "None" placeholder
accession_columns = ["5p accession", "3p accession"]
mirgenedb_accessions = [
    accession
    for column in accession_columns
    for accession in mirgenedb[column].unique()
    if accession != "None"
]

# select the miRBase rows whose MIMAT accession appears in MirGeneDB
mirgenedb_mirnas = mirbase[mirbase["MIMAT"].isin(mirgenedb_accessions)]

In [17]:
# likely real miRNAs: index labels selected via MirGeneDB or marked high confidence in miRBase
real_mirnas = mirgenedb_mirnas.index.union(mirbase_high_conf.index).tolist()

In [19]:
# persist the list of likely real miRNAs as a pickle file
pickle_path = "../microrna_data/likely_real_mirnas.pkl"
with open(pickle_path, "wb") as pickle_file:
    pickle.dump(real_mirnas, pickle_file)