In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import itertools
import os
import pickle
from library2_utils.color_scheme import cell_line_colors, cell_line_symbols
from library2_utils.transfer_functions import transfer_function
from typing import Union
import ast

# Global matplotlib text defaults for every figure in this notebook:
# font size 7 and the Helvetica font family.
plt.rcParams.update({
    'font.size': 7,
    'font.family': 'Helvetica',
})

In [2]:
# Load the extended miRBase table; the first CSV column becomes the index.
mirbase = pd.read_csv("../microrna_data/mirbase_extended.csv", index_col=0)

# Work at the RNA level: rewrite every "T" as "U" in both sequence columns.
for column in ("sequence_norm", "target"):
    mirbase[column] = mirbase[column].str.replace("T", "U")

# find identical miRNAs

In [8]:
# miRNAs are treated as identical when the first 18 characters of their
# normalised sequence match, so store that prefix in its own column.
mirbase["sequence_norm_short"] = mirbase["sequence_norm"].str.slice(stop=18)

# Group the index labels by prefix and keep only the prefixes shared by more
# than one row; each entry is the list of labels that share one prefix.
mirna_groups = [
    list(members)
    for members in mirbase.groupby("sequence_norm_short").groups.values()
    if len(members) > 1
]

In [11]:
# Write the merge groups to a text file: one group per line, with the member
# names separated by ", ".
# The names are joined directly rather than derived from the list's repr, so a
# name containing a quote or square bracket is written out unchanged.
with open("../outputs/5_mutations/mirna_merge_groups.txt", "w") as f:
    for group in mirna_groups:
        f.write(", ".join(map(str, group)) + "\n")

# find crosstalking miRNAs

In [12]:
# Folder that holds the inputs and outputs of this analysis step.
output_folder = "../outputs/5_mutations"

# Load the crosstalk dictionary. The loop further down treats each value as a
# DataFrame looked up by the dictionary key.
# NOTE(review): pickle.load can execute arbitrary code while deserialising, so
# only load files produced by a trusted source.
with open(f"{output_folder}/5.5_full_crosstalk_dict.pkl", "rb") as pickle_file:
    full_crosstalk_dict = pickle.load(pickle_file)

In [33]:
def _select_crosstalkers(key, impact_df):
    """Return the index labels of `impact_df` rows that pass every threshold.

    Parameters
    ----------
    key
        Label of the entry being examined; the row carrying this same label
        is always excluded from the result.
    impact_df : pd.DataFrame
        Frame with the numeric columns "no_total_impact", "no_high_impact",
        "no_mid_impact" and "no_low_impact" (presumably mutation counts per
        impact class, judging by the column names -- confirm upstream).

    Returns
    -------
    list
        Index labels, in their original order, of the rows that satisfy all
        threshold conditions.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    high = impact_df["no_high_impact"]
    mid = impact_df["no_mid_impact"]
    low = impact_df["no_low_impact"]

    passes_all = (
        (impact_df.index != key)                  # never report the entry itself
        & (impact_df["no_total_impact"] < 5)      # total below 5
        & (high < 2)                              # high impact below 2
        & (mid + high < 4)                        # mid + high impact below 4
        & (low + mid + high < 5)                  # low + mid + high impact below 5
    )
    return list(impact_df.index[passes_all])


# One row per key of full_crosstalk_dict. The single column holds the labels
# that passed the filter as a ", "-separated string, or "" when none did.
# The string is built with str.join instead of trimming the list's repr, so
# labels containing quotes are preserved. The column is created as object dtype
# in one step instead of being pre-filled with booleans and overwritten row by row.
crosstalk_filter_df = pd.DataFrame(
    {
        "crosstalking miRNAs": [
            ", ".join(map(str, _select_crosstalkers(key, impact_df)))
            for key, impact_df in full_crosstalk_dict.items()
        ]
    },
    index=list(full_crosstalk_dict.keys()),
    dtype=object,
)

In [37]:
# Keep only the rows whose "crosstalking miRNAs" entry is not the empty string.
crosstalk_filter_df = crosstalk_filter_df.loc[
    crosstalk_filter_df["crosstalking miRNAs"].ne("")
]

In [39]:
# Save the filtered table, index included, as a CSV file in the output folder.
crosstalk_filter_df.to_csv(
    path_or_buf=f"{output_folder}/5b_crosstalk_filter_df_str.csv"
)