In [1]:
# miRNA enrichment analysis for each supercluster
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import requests
plt.rcParams['svg.fonttype'] = 'none'

# Load the gene-miRNA edge table and the miRNA supercluster assignments, then
# keep only the strongly negatively correlated gene-miRNA pairs for the
# enrichment analysis.

# Input files (paths are relative to the notebook; adjust them here if needed).
MIRNA_EDGES_PATH = "../files/databases/mirna_edges.csv"
MIRNA_CLUSTERS_PATH = "../files/Supercluster_excel/miR_superclustering.csv"

# Thresholds for the enrichment analysis: an edge is kept only if its
# correlation is below MAX_CORRELATION and its score is above MIN_SCORE.
MAX_CORRELATION = -0.7
MIN_SCORE = 100

mirna_edges = pd.read_csv(MIRNA_EDGES_PATH)
mirna_clusters = pd.read_csv(MIRNA_CLUSTERS_PATH)
mirna_corr = mirna_edges[
    (mirna_edges["correlation"] < MAX_CORRELATION)
    & (mirna_edges["score"] > MIN_SCORE)
]

In [2]:
mirna_edges.tail() # sanity check: show the last rows of the full (unfiltered) edge table

Unnamed: 0.1,Unnamed: 0,gene_name,mirna,correlation,p_value,cluster_gene,supercluster_gene,cluster_mirna,supercluster_mirna,validated,diana_score,Row.names,baseMean,scaled_expression,ranking,score
1232515,29185698,ZNF488,hsa-miR-4739,-0.224262,0.370986,brown4,neural progenitor_late,purple,3,1.0,0.0,hsa-miR-4739,0.252322,4.554972e-07,0.07,1.569832
1232516,29185739,ZNF488,hsa-miR-2276-5p,0.37683,0.123207,brown4,neural progenitor_late,salmon,2,1.0,0.0,hsa-miR-2276-5p,2.19984,3.971205e-06,0.37,13.942711
1232517,29185772,ZNF488,hsa-miR-642a-5p,0.608175,0.007409,brown4,neural progenitor_late,salmon,2,1.0,0.0,hsa-miR-642a-5p,16.898954,3.050641e-05,0.65,39.531354
1232518,29185781,ZNF488,hsa-miR-7106-3p,0.14098,0.576853,brown4,neural progenitor_late,salmon,2,1.0,0.0,hsa-miR-7106-3p,0.265114,4.785901e-07,0.08,1.127841
1232519,29185972,ZNF488,hsa-miR-18a-3p,0.073747,0.771192,brown4,neural progenitor_late,lightcyan,3,1.0,0.0,hsa-miR-18a-3p,758.3802,0.001369047,0.91,6.710998


In [3]:
# Determine the five most highly expressed miRNAs (by baseMean) of every
# miRNA module among the strongly anti-correlated edges.
superclusters = mirna_clusters[[ "cluster", "hierachical_cluster"]].drop_duplicates()  # cluster from expression dataset
mirna_expression = mirna_corr[["mirna", "baseMean", "cluster_mirna"]].drop_duplicates()  # cluster from miRNA_edges dataset

# One frame per module, collected in a list and concatenated once.
# DataFrame.append no longer exists in pandas >= 2.0, and growing a frame
# inside a loop copies it on every iteration. groupby also yields the modules
# in sorted order, so the result is reproducible between runs (iterating a
# set of strings is not). Rows without a module label are skipped by groupby,
# as they were by the original equality filter.
top5_per_module = [
    module_expression.sort_values("baseMean", ascending=False).head(5)
    for _, module_expression in mirna_expression.groupby("cluster_mirna")
]
if top5_per_module:
    mirna_top5 = pd.concat(top5_per_module)
else:
    # No module passed the thresholds: keep an empty frame with the same columns.
    mirna_top5 = mirna_expression.iloc[0:0]
# Optional export:
#mirna_top5.to_csv("Top_miRNAs_module.csv")
mirna_top5

Unnamed: 0,mirna,baseMean,cluster_mirna
491759,hsa-miR-21-5p,223585.097853,brown
330794,hsa-miR-100-5p,45908.620502,brown
221718,hsa-miR-152-3p,8129.452567,brown
201449,hsa-let-7e-5p,6748.960598,brown
223805,hsa-miR-501-3p,3821.249948,brown
...,...,...,...
3885,hsa-miR-25-3p,72951.837511,red
1246,hsa-miR-92a-3p,43191.537732,red
1170900,hsa-miR-151a-3p,20841.078095,red
176797,hsa-miR-130b-5p,9263.201970,red


In [4]:
def mirna_analysis_go_div(diana_df, species, timeout=300):
    """Run a g:Profiler enrichment for a gene list and keep only the leaf terms.

    The genes are posted to the g:Profiler ``gost/profile`` endpoint with the
    sources GO:BP, GO:MF, GO:CC and KEGG. A returned term is kept only if no
    other returned term lists it among its ``parents``, i.e. only the most
    specific terms of the result set are reported.

    Parameters
    ----------
    diana_df : iterable of str
        Gene identifiers to query. Despite the name this is sent as the plain
        ``query`` list; in this notebook it is called with a list of gene names.
    species : str
        Organism identifier sent as ``organism`` (e.g. ``"hsapiens"``).
    timeout : float, optional
        Seconds to wait for the service before the request is aborted, so a
        stalled connection cannot hang the notebook forever.

    Returns
    -------
    pandas.DataFrame
        Columns ``"go-terms"``, ``"description"``, ``"source"`` and
        ``"p-value"``, one row per kept term in the order the service returned
        them. The frame is empty (same columns) when the query is empty or the
        response does not contain the expected fields.

    Raises
    ------
    requests.HTTPError
        If the service answers with an HTTP error status.
    """
    columns = ["go-terms", "description", "source", "p-value"]
    protein_list = list(diana_df)
    print(len(protein_list))

    # Nothing to enrich: skip the network round trip.
    if not protein_list:
        return pd.DataFrame(columns=columns)

    r = requests.post(url='https://biit.cs.ut.ee/gprofiler/api/gost/profile/',
                      json={
                          'organism': species,
                          'query': protein_list,
                          'sources': ["GO:BP", "GO:MF", "GO:CC", "KEGG"],
                      },
                      headers={'User-Agent': 'FullPythonRequest'},
                      timeout=timeout)
    # Surface HTTP failures here instead of failing later on a missing key.
    r.raise_for_status()

    try:
        data = r.json()["result"]
        # Every id that is named as a parent of some returned term; a set keeps
        # the membership test below O(1) per term.
        parent_ids = {parent for term in data for parent in term["parents"]}
        rows = [
            {
                "go-terms": term["native"],
                "description": term["name"],
                "source": term["source"],
                "p-value": term["p_value"],
            }
            for term in data
            if term["native"] not in parent_ids
        ]
    except KeyError as e:
        # The original swallowed this error and then crashed on an unbound
        # variable; report it and return an empty, well-formed table instead.
        import warnings
        warnings.warn(
            "g:Profiler response is missing the expected field {}; "
            "returning an empty enrichment table".format(e))
        rows = []

    return pd.DataFrame(rows, columns=columns)

In [13]:
# Reverse enrichment per miRNA cluster: collect the genes that are strongly
# anti-correlated with the miRNAs of a cluster and run the enrichment on them.
#
# The per-cluster tables are gathered in lists and concatenated once after the
# loop. DataFrame.append no longer exists in pandas >= 2.0, and appending
# inside the loop copied the growing frame on every iteration.
top_enrichment_parts = []  # ten most significant terms of each cluster
all_enrichment_parts = []  # every kept term of each cluster
gene_parts = []            # queried genes of each cluster

# NOTE: `mirna` holds a cluster label (a value of mirna_clusters["cluster"]),
# not a single miRNA. unique() iterates the labels in order of first
# appearance, so the result order is reproducible (iterating a set is not).
for mirna in mirna_clusters["cluster"].dropna().unique():
    # miRNAs assigned to this cluster
    mirna_liste = mirna_clusters.loc[mirna_clusters["cluster"] == mirna, "Row.names"]
    # distinct genes linked to those miRNAs; missing names are dropped because
    # they cannot be sent as part of the query
    genes_liste = (
        mirna_corr.loc[mirna_corr["mirna"].isin(mirna_liste), "gene_name"]
        .dropna()
        .drop_duplicates()
        .tolist()
    )

    # genes per cluster, labelled with the queried cluster
    genes_dataframe = pd.DataFrame({"genes": genes_liste}).assign(cluster=str(mirna))

    # enrichment of the cluster's genes, labelled with the queried cluster
    genes_enrichment = mirna_analysis_go_div(genes_liste, "hsapiens").assign(cluster=str(mirna))
    genes_enrichment_top = genes_enrichment.sort_values("p-value").iloc[:10, :]

    top_enrichment_parts.append(genes_enrichment_top)
    all_enrichment_parts.append(genes_enrichment)
    gene_parts.append(genes_dataframe)

# pd.concat raises on an empty list, so fall back to an empty frame.
top_mirna_enr = pd.concat(top_enrichment_parts) if top_enrichment_parts else pd.DataFrame()
mirna_cenr = pd.concat(all_enrichment_parts) if all_enrichment_parts else pd.DataFrame()
genes_mirna = pd.concat(gene_parts) if gene_parts else pd.DataFrame()

496
576
440
136
44
289
285
444
740
560
238
467
188
45
39
110
366
