In [1]:
import pandas as pd
import re
from lib.NA_sequence_utilities import *

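Family annotations in miRBase do not record whether a mature miRNA originates from the 5p or the 3p arm of its precursor, which makes them unsuitable for predicting crosstalk.
Here, we add the arm information to the family name to enable better crosstalk prediction.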

In [3]:
# get mirbase data
mirbase_df = pd.read_csv('../input_data/mirbase.csv', index_col=0)

# fix nan values in mirbase: miRNAs without an annotated family get a
# placeholder family "mir-<third dash-separated field of the miRNA name>"
mirbase_df["family"] = mirbase_df["family"].astype(str)
mirbase_df["family"] = mirbase_df.apply(lambda x: "mir-" + x.name.split("-")[2] if x["family"]=="nan" else x["family"], axis=1)

# Build "family_extended": the family name plus the arm ("5p"/"3p") the miRNA
# stems from. The value is collected in a local variable and written exactly
# once per row, so the column is never read before it has been created.
for index, row in mirbase_df.iterrows():
    arm = index.split('-')[-1]
    if arm in ["5p", "3p"]:
        # the miRNA name itself carries the arm
        family_extended = row["family"] + "-" + arm
    else:
        # No arm in the name: borrow it from a member of the same family whose
        # seed (sequence[1:8]) is identical. If no such member exists, the plain
        # family name is kept; if several members match, the last one wins.
        family_extended = row["family"]
        seed_mirna = row["sequence"][1:8]
        family_mirnas = mirbase_df.loc[mirbase_df["family"] == row["family"], :].index
        for mirna in family_mirnas:
            member_arm = mirna.split('-')[-1]
            if member_arm in ["5p", "3p"] and mirbase_df.loc[mirna, "sequence"][1:8] == seed_mirna:
                family_extended = row["family"] + "-" + member_arm
    mirbase_df.loc[index, "family_extended"] = family_extended

To insert sequences, we need to normalize the length of miRNA targets to 21 nt. Add the normalized target to mirbase:

In [5]:
def fill_in_sequence(sequence):
    """Pad a sequence at its end so that it is 21 nt long.

    - Sequences of 21 nt or more are returned unchanged (no truncation here).
    - A 20-nt sequence ending in "A" gets one extra "A" instead of a "T";
      this is the only case in which the padding base is adapted to avoid
      creating an ATG.
    - Every other sequence is padded with "T" up to 21 nt.
    """
    length = len(sequence)

    # already long enough: nothing to add
    if length > 20:
        return sequence

    # exactly one base missing and the last base is an A: pad with A
    if length == 20 and sequence[-1] == "A":
        return sequence + "A"

    # default: fill the missing positions with T
    missing = 21 - length
    return sequence + "T" * missing

# Normalize every miRNA sequence to 21 nt and convert it to DNA:
#   1. keep at most the first 21 bases,
#   2. pad shorter sequences up to 21 nt (see fill_in_sequence),
#   3. replace U by T.
mirbase_df["sequence"] = (
    mirbase_df["sequence"]
    .str.slice(stop=21)
    .apply(fill_in_sequence)
    .apply(lambda seq: seq.replace("U", "T"))
)

# the target site is the reverse complement of the (DNA) miRNA sequence
mirbase_df["target"] = mirbase_df["sequence"].apply(lambda seq: reverse_complement(seq, alph="DNA"))

# ATG bookkeeping, in case we want to integrate miRNA sites into the 5' UTR.
# start positions of all ATGs in the target
mirbase_df["ATG_pos"] = mirbase_df["target"].apply(
    lambda target: [hit.start() for hit in re.finditer('ATG', target)]
)
# the same positions modulo 3; targets without any ATG get [0]
mirbase_df["ATG_pos_mod3"] = mirbase_df["ATG_pos"].apply(
    lambda positions: [pos % 3 for pos in positions] or [0]
)
# number of ATGs in the target
mirbase_df["ATG_count"] = mirbase_df["target"].apply(lambda target: target.count("ATG"))

In [None]:
# Save the annotated table (including the family_extended, target and ATG
# columns) to a new CSV file; the input file mirbase.csv is not overwritten.
mirbase_df.to_csv("../input_data/mirbase_with_families_and_targets.csv")