## Inference Algorithms Validation

#### Libraries

In [25]:
import pickle
import pandas as pd
import random
import time
import numpy as np
from algorithms.DIAMOnD import DIAMOnD
from algorithms.DiaBLE import DiaBLE
from algorithms.DIFFUSION import run_heat_diffusion
from algorithms.Funct_DIAMOnD import Funct_DIAMOnD
from algorithms.EnrichrQuery import Enrich_Main_List

#### Variables

In [13]:
# ---- Input files (read by the "Load files" cell) ----
Graph_of_PPI_LCC_file_name = "results/PPIGraph/Graph_of_PPI_LCC"
Seed_Genes_in_the_Interactome_file_name = "results/GDA/Seed_Genes_in_the_Interactome"

# ---- Output files (written by the inference runs, read back for the tables) ----
node_inferred_by_DIAMOnD_file = 'results/Inference/Node Inferred by DIAMOnD'
node_inferred_by_DiaBLE_file = 'results/Inference/Node Inferred by DiaBLE'
File_Name_of_Metrics_of_DiaBLE_inference = "results/Inference/Metrics_of_DiaBLE_inference"
File_Name_of_Metrics_of_DIAMOnD_inference = "results/Inference/Metrics_of_DIAMOnD_inference"
File_Name_of_Metrics_of_Diffusion_inference = "results/Inference/Metrics_of_Diffusion_inference"

# Alternative settings kept for reference:
# File_Name_of_Enrichments_Results = 'Enriched Genes Weight Seed Normalization.pickle'
# File_Name_of_Enrichments_Results = 'Enriched Genes Weight Self Normalization.pickle'
# Set_of_Seed_Enriched_Onthologies_file_name = 'Set_of_Seed_Enriched_Onthologies'

# ---- Validation parameters ----
# Number of folds the seed genes are split into.
k_parameter_of_splitting = 5
# Sizes of the ranked prediction lists that get evaluated.
Top_positions = [25, 50, 65, 100, 130, 180, 250]
# Top_positions = [25, 50, 65, 130, 252]
Diffusion_times_to_try = [0.002, 0.005, 0.01]

# ---- Metric matrices ----
# Each matrix has one row per entry of Top_positions and one column per fold.
# Every row is built separately so that no two rows share the same list object.
Precision_Matrix = [[None] * k_parameter_of_splitting for _ in Top_positions]
Recall_Matrix = [[None] * k_parameter_of_splitting for _ in Top_positions]
Founded_True_Positive = [[None] * k_parameter_of_splitting for _ in Top_positions]
F1_score_Matrix = [[None] * k_parameter_of_splitting for _ in Top_positions]
Metrics = [
    Founded_True_Positive,
    Precision_Matrix,
    Recall_Matrix,
    F1_score_Matrix,
]
# One (Top_positions x folds) matrix slot per diffusion time.
Diffusion_Metrics_on_various_times = [
    [[None] * k_parameter_of_splitting for _ in Top_positions]
    for _ in Diffusion_times_to_try
]

# ---- Table labels ----
# Short header set; it matches the commented-out five-entry Top_positions above.
Columns_of_the_Table = [f"Top {size}" for size in (25, 50, 65, 130, 252)]
Rows_of_the_Table = ["Precision", "Recall", "F1 Score"]

# Header set with one label per entry of the active Top_positions list.
Columns_of_the_Table_Extended = [f"Top {size}" for size in Top_positions]

#### Load files

In [17]:

# Load the two pickled inputs named in the "Variables" cell.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# these files must come from a trusted earlier step of this pipeline.

# Presumably the PPI graph restricted to its largest connected component
# (inferred from the variable name only; confirm against the producing notebook).
with open(Graph_of_PPI_LCC_file_name, "rb") as file:
    Graph_of_PPI_LCC = pickle.load(file)

# Seed genes; later passed through list(...) and split into validation folds.
with open(Seed_Genes_in_the_Interactome_file_name, "rb") as file:
    Seed_genes_Symbols = pickle.load(file)

#### Functions

In [20]:
def randomly_split_of_the_list(list_of_gene, n_split):
    """Shuffle the genes reproducibly and split them into ``n_split`` folds.

    The shuffle is applied to a copy, so the caller's list keeps its order.
    Because the input is no longer reordered as a side effect, calling the
    function twice on the same list yields the same folds.

    Args:
        list_of_gene: Iterable of gene identifiers.
        n_split: Number of folds to produce.

    Returns:
        A list of ``n_split`` lists of ``str``. Fold sizes follow
        ``numpy.array_split``: they differ by at most one element, with the
        larger folds first.
    """
    shuffled_genes = list(list_of_gene)
    # The module-level generator is re-seeded on purpose (as before), so the
    # split - and any later use of ``random`` - is reproducible across runs.
    random.seed(23)
    random.shuffle(shuffled_genes)
    # array_split returns numpy arrays; convert every element back to a
    # plain Python string.
    return [[str(element) for element in chunk]
            for chunk in np.array_split(shuffled_genes, n_split)]


def Calculating_Metrics(nodes_added, Validation_list_of_seeds,
                        it_on_Top_position, it_on_validation):
    """Score one list of predicted nodes against one validation fold.

    The true-positive count, precision, recall and F1 score are written into
    the module-level matrices ``Founded_True_Positive``, ``Precision_Matrix``,
    ``Recall_Matrix`` and ``F1_score_Matrix`` at position
    ``[it_on_Top_position][it_on_validation]``.

    A metric that is undefined is stored as ``None``:
      * precision when ``nodes_added`` is empty,
      * recall when ``Validation_list_of_seeds`` is empty,
      * F1 when precision or recall is undefined or when their sum is zero.
    Previously an empty list raised ``ZeroDivisionError``.

    Args:
        nodes_added: Predicted nodes (any sized iterable of hashables).
        Validation_list_of_seeds: Held-out seed genes of the current fold.
        it_on_Top_position: Row index (position in ``Top_positions``).
        it_on_validation: Column index (fold number).

    Returns:
        ``[Founded_True_Positive, Precision_Matrix, Recall_Matrix,
        F1_score_Matrix]`` - the shared module-level matrices themselves,
        not copies.
    """
    True_positive = len(set(nodes_added).intersection(Validation_list_of_seeds))
    # len() is used instead of truthiness so array-like inputs work as well.
    number_of_predictions = len(nodes_added)
    number_of_validation_seeds = len(Validation_list_of_seeds)

    precision = (True_positive / number_of_predictions
                 if number_of_predictions else None)
    recall = (True_positive / number_of_validation_seeds
              if number_of_validation_seeds else None)

    if precision is None or recall is None or not (precision + recall):
        # F1 is undefined here; kept as None, exactly as the original did
        # for precision + recall == 0.
        f1_score = None
    else:
        f1_score = 2 * (precision * recall) / (precision + recall)

    Founded_True_Positive[it_on_Top_position][it_on_validation] = True_positive
    Precision_Matrix[it_on_Top_position][it_on_validation] = precision
    Recall_Matrix[it_on_Top_position][it_on_validation] = recall
    F1_score_Matrix[it_on_Top_position][it_on_validation] = f1_score

    Metrics = [Founded_True_Positive, Precision_Matrix,
               Recall_Matrix, F1_score_Matrix]
    return Metrics


def Implementing_Inference_Algorithms(Choosed_algorithm,
                                      Graph_of_PPI_LCC, Seed_genes_Symbols, Top_positions,
                                      outfile=None, Diffusion_time=0.005):
    """Cross-validate one inference algorithm on the seed genes.

    The seed genes are split into ``k_parameter_of_splitting`` folds. Each
    fold is used once as validation set while the remaining folds form the
    training seeds. For every entry of ``Top_positions`` the chosen algorithm
    is run on the training seeds and scored with ``Calculating_Metrics``.

    Changes with respect to the previous version:
      * The result is a snapshot (copy) of the metric matrices. Before, the
        shared module-level matrices were returned, so results stored from
        successive runs (e.g. one per diffusion time) all pointed at the
        same objects and showed only the last run.
      * ``outfile`` is honoured. When omitted, DIAMOnD writes to
        ``node_inferred_by_DIAMOnD_file`` (unchanged) and DiaBLE writes to
        ``node_inferred_by_DiaBLE_file`` instead of the DIAMOnD file.
      * An unknown method is reported once, before any work is done, and the
        message lists the names that are actually accepted.
      * The validation fold is excluded by index rather than by value.

    Args:
        Choosed_algorithm: 'DIAMOnD', 'DiaBLE', 'Diffusion' or 'Funct_DIAMOnD'.
        Graph_of_PPI_LCC: Graph handed to the inference algorithm.
        Seed_genes_Symbols: Iterable of seed genes.
        Top_positions: Sizes of the prediction lists to evaluate. The metric
            matrices are sized from the module-level ``Top_positions``.
        outfile: Output file for DIAMOnD / DiaBLE; ``None`` selects the
            per-algorithm default described above.
        Diffusion_time: Diffusion time, used by 'Diffusion' only.

    Returns:
        ``[true positives, precision, recall, F1]``, each a list of rows
        (one per top position) with one value per fold; ``[]`` when the
        method name is not recognised.

    Note:
        'Funct_DIAMOnD' reads the module-level list
        ``File_Name_of_Enrichments_Results`` (one entry per fold), which must
        be defined before this function is called.
    """
    if Choosed_algorithm not in ('DIAMOnD', 'DiaBLE', 'Diffusion', 'Funct_DIAMOnD'):
        print("Invalid Inference Method. \n" +
              "Please choose one from the following:\n" +
              "'DIAMOnD', 'DiaBLE', 'Diffusion', 'Funct_DIAMOnD' \n")
        return []

    if outfile is None:
        outfile = (node_inferred_by_DiaBLE_file
                   if Choosed_algorithm == 'DiaBLE'
                   else node_inferred_by_DIAMOnD_file)

    List_of_sublists_of_seeds = randomly_split_of_the_list(
        list(Seed_genes_Symbols), k_parameter_of_splitting)

    Metrics = []
    for it_on_validation, Validation_list_of_seeds in enumerate(List_of_sublists_of_seeds):
        # Training seeds: every fold except the one held out for validation.
        Training_list_of_seeds = [
            Gene
            for fold_index, sublist in enumerate(List_of_sublists_of_seeds)
            if fold_index != it_on_validation
            for Gene in sublist
        ]

        # The enrichment depends only on the training seeds, so it is
        # computed once per fold rather than once per top position.
        Set_of_Seed_Enriched_Onthologies = None
        if Choosed_algorithm == 'Funct_DIAMOnD':
            Set_of_Seed_Enriched_Onthologies = Enrich_Main_List(Training_list_of_seeds)

        for it_on_Top_position, number_of_positions in enumerate(Top_positions):
            if Choosed_algorithm == 'Diffusion':
                # run_heat_diffusion's result is scored directly.
                nodes_added = run_heat_diffusion(
                    Graph_of_PPI_LCC, Training_list_of_seeds,
                    diffusion_time=Diffusion_time,
                    n_positions=number_of_positions)
            else:
                if Choosed_algorithm == 'DIAMOnD':
                    list_of_nodes_added = DIAMOnD(
                        Graph_of_PPI_LCC, Training_list_of_seeds,
                        number_of_positions, alpha=1, outfile=outfile)
                elif Choosed_algorithm == 'DiaBLE':
                    list_of_nodes_added = DiaBLE(
                        Graph_of_PPI_LCC, Training_list_of_seeds,
                        number_of_positions, alpha=1, outfile=outfile)
                else:  # 'Funct_DIAMOnD'
                    list_of_nodes_added = Funct_DIAMOnD(
                        Graph_of_PPI_LCC, Training_list_of_seeds,
                        number_of_positions,
                        Set_of_Seed_Enriched_Onthologies,
                        alpha=1,
                        File_Name_of_Enrichments_Results=
                        File_Name_of_Enrichments_Results[it_on_validation])
                # These algorithms return tuples; the node is the first item.
                nodes_added = [Generic_tuple[0]
                               for Generic_tuple in list_of_nodes_added]

            Metrics = Calculating_Metrics(nodes_added,
                                          Validation_list_of_seeds,
                                          it_on_Top_position, it_on_validation)

    # Calculating_Metrics returns the shared module-level matrices; copy
    # them so later runs cannot overwrite this result.
    return [[list(row) for row in matrix] for matrix in Metrics]


def Handle_ZeroDivisionError(array):
    """Compute NaN-ignoring mean and standard deviation row by row.

    ``array`` is walked as a sequence of matrices, each a sequence of rows.
    For every row ``np.nanmean`` and ``np.nanstd`` are evaluated; if either
    call raises ``ZeroDivisionError`` both statistics of that row are NaN.

    Returns:
        ``(means, stds)``: two nested lists with one inner list per matrix
        and one value per row.
    """
    def _row_statistics(row):
        # Both calls sit in the same try so a failure in either yields NaN
        # for both values.
        try:
            return np.nanmean(row), np.nanstd(row)
        except ZeroDivisionError:
            return np.nan, np.nan

    means, stds = [], []
    for matrix in array:
        statistics_per_row = [_row_statistics(row) for row in matrix]
        means.append([mean for mean, _ in statistics_per_row])
        stds.append([std for _, std in statistics_per_row])
    return means, stds


def Table_Visualization_of_Metrics(Metrics, Columns_of_the_Table, Rows_of_the_Table):
    """Summarise metric matrices as a table of 'mean ± std' strings.

    ``Metrics`` is converted to a float64 array (``None`` entries become
    NaN) and reduced over its third axis with NaN-ignoring mean and standard
    deviation, both rounded to 3 decimals.

    Args:
        Metrics: Nested sequence indexed as [metric][column][sample].
        Columns_of_the_Table: Column labels, one per entry of the second axis.
        Rows_of_the_Table: Row labels, one per entry of the first axis.

    Returns:
        A ``pandas.DataFrame`` of strings formatted as ``'<mean> ± <std>'``.
    """
    values = np.array(Metrics).astype(np.float64)
    try:
        averages = np.nanmean(values, axis=2)
        spreads = np.nanstd(values, axis=2)
    except ZeroDivisionError:
        # Fall back to the row-by-row computation.
        averages, spreads = Handle_ZeroDivisionError(values)
    averages = np.round(averages, 3)
    spreads = np.round(spreads, 3)

    # Pair every mean with its standard deviation, one table row at a time.
    cells = []
    for average_row, spread_row in zip(averages, spreads):
        cells.append([str(average) + ' ± ' + str(spread)
                      for average, spread in zip(average_row, spread_row)])

    return pd.DataFrame(cells,
                        columns=Columns_of_the_Table,
                        index=Rows_of_the_Table)

#### Implementation

*Uncomment sections to reproduce the results*

In [None]:
'''Uncomment this Section to reproduce the DIFFUSION ALGORITHM results'''
# # Creation of the group of Matrices of Metrics
# Choosed_algorithm = 'Diffusion'
# for it_on_diffusion_times in range(len(Diffusion_times_to_try)):
#     Diffusion_Metrics = \
#         Implementing_Inference_Algorithms(Choosed_algorithm, Graph_of_PPI_LCC,
#                                           Seed_genes_Symbols, Top_positions,
#                                           Diffusion_time = \
#                                               Diffusion_times_to_try[it_on_diffusion_times])
#     Diffusion_Metrics_on_various_times[it_on_diffusion_times] = Diffusion_Metrics

# # Saving of the group of Matrices of Matrix
# with open(File_Name_of_Metrics_of_Diffusion_inference, 'wb') as file:
#     pickle.dump(Diffusion_Metrics_on_various_times, file)
''''''

'''Uncomment this Section to reproduce the DIABLE ALGORITHM results'''
# # Creation of the Matrices of the Metrics
# Choosed_algorithm = 'DiaBLE'
# File_Name_of_Metrics_of_DiaBLE_inference = "Metrics_of_DiaBLE_inference"

# DiaBLE_Metrics = Implementing_Inference_Algorithms(Choosed_algorithm, Graph_of_PPI_LCC,
#                                   Seed_genes_Symbols, Top_positions)

# # Saving the Matrices of the Metrics
# with open(File_Name_of_Metrics_of_DiaBLE_inference, 'wb') as file:
#     pickle.dump(DiaBLE_Metrics, file)
''''''

'''Uncomment this Section to reproduce the DIAMOND ALGORITHM results'''
# # Creation of the Matrices of the Metrics
# Choosed_algorithm = 'DIAMOnD'
# File_Name_of_Metrics_of_DIAMOnD_inference = "Metrics_of_DIAMOnD_inference"

# DIAMOnD_Metrics = Implementing_Inference_Algorithms(Choosed_algorithm, Graph_of_PPI_LCC,
#                                   Seed_genes_Symbols, Top_positions)

# # Saving the Matrices of the Metrics
# with open(File_Name_of_Metrics_of_DIAMOnD_inference, 'wb') as file:
#     pickle.dump(DIAMOnD_Metrics, file)
''''''

In [21]:


# Read back the diffusion metrics saved by the (commented-out) diffusion run.
with open(File_Name_of_Metrics_of_Diffusion_inference, 'rb') as file:
    Diffusion_Metrics_on_various_times_download = pickle.load(file)

# One table per diffusion time, filled in below.
Table_Visualization_of_Diffusion_Metrics_iterated = \
    [None] * len(Diffusion_times_to_try)

for it_on_Visualizations in range(len(Diffusion_times_to_try)):
    # [1:] skips the first matrix of each group so that three matrices remain,
    # matching the three labels in Rows_of_the_Table (presumably the skipped
    # one holds the true-positive counts - confirm against the saved file).
    metrics_for_this_time = \
        Diffusion_Metrics_on_various_times_download[it_on_Visualizations][1:]
    table_for_this_time = Table_Visualization_of_Metrics(
        metrics_for_this_time,
        Columns_of_the_Table_Extended,
        Rows_of_the_Table)
    Table_Visualization_of_Diffusion_Metrics_iterated[it_on_Visualizations] = \
        table_for_this_time
    print(table_for_this_time)
# Remove the loop helpers from the notebook namespace.
del it_on_Visualizations, metrics_for_this_time, table_for_this_time

                 Top 25         Top 50         Top 65        Top 100  \
Precision  0.016 ± 0.02  0.016 ± 0.008  0.022 ± 0.016  0.018 ± 0.006   
Recall     0.008 ± 0.01  0.016 ± 0.008   0.028 ± 0.02  0.048 ± 0.016   
F1 Score    0.027 ± 0.0     0.02 ± 0.0   0.03 ± 0.014  0.027 ± 0.009   

                 Top 130        Top 180        Top 250  
Precision  0.021 ± 0.007  0.036 ± 0.014   0.029 ± 0.01  
Recall     0.103 ± 0.033  0.127 ± 0.048  0.143 ± 0.048  
F1 Score   0.034 ± 0.011  0.056 ± 0.022  0.048 ± 0.016  
                 Top 25         Top 50         Top 65        Top 100  \
Precision  0.016 ± 0.02  0.016 ± 0.008  0.022 ± 0.016  0.018 ± 0.006   
Recall     0.008 ± 0.01  0.016 ± 0.008   0.028 ± 0.02  0.048 ± 0.016   
F1 Score    0.027 ± 0.0     0.02 ± 0.0   0.03 ± 0.014  0.027 ± 0.009   

                 Top 130        Top 180        Top 250  
Precision  0.021 ± 0.007  0.036 ± 0.014   0.029 ± 0.01  
Recall     0.103 ± 0.033  0.127 ± 0.048  0.143 ± 0.048  
F1 Score   0.034 ± 0.01

In [None]:
# Creation of the Matrices of the Metrics for Funct_DIAMOnD
Choosed_algorithm = 'Funct_DIAMOnD'

# One enrichment-results file per validation fold. The list is indexed by
# fold number inside Implementing_Inference_Algorithms, so it needs one
# entry for each of the k_parameter_of_splitting folds.
File_Name_of_Enrichments_Results = ['Enriched_Genes_Weight_First_Validation_Step',
                                    'Enriched_Genes_Weight_Second_Validation_Step',
                                    'Enriched_Genes_Weight_Third_Validation_Step',
                                    'Enriched_Genes_Weight_Fourth_Validation_Step',
                                    'Enriched_Genes_Weight_Fifth_Validation_Step']

# Time the whole cross-validation run.
start_time = time.time()
Funct_DIAMOnD_Metrics = Implementing_Inference_Algorithms(Choosed_algorithm, Graph_of_PPI_LCC,
                                                          Seed_genes_Symbols,
                                                          Top_positions)
end_time = time.time()
Computational_Time = (end_time - start_time) / 60  # elapsed time in minutes


# [1:] drops the true-positive counts so the remaining three matrices match
# the three labels in Rows_of_the_Table.
Table_Visualization_of_Funct_DIAMOnD = \
    Table_Visualization_of_Metrics(Funct_DIAMOnD_Metrics[1:],
                                   Columns_of_the_Table_Extended,
                                   Rows_of_the_Table)

# Index 0 is the first validation step; the previous index 1 loaded the
# second-step file into a variable named after the first step.
# NOTE(review): pickle.load can execute arbitrary code; only load files
# produced by this pipeline.
with open(File_Name_of_Enrichments_Results[0], "rb") as file:
    Enriched_Genes_Weight_First_Validation_Step = pickle.load(file)