In [28]:
import numpy as np
import pandas as pd
from utils.utils import load_dataset

In [41]:
# Count subjects whose off-diagonal similarities are all zero (i.e. no usable data)
def calculate_missing_data(matrix):
    """
    Count the subjects with missing data in a similarity matrix.

    A subject (row) counts as missing when every off-diagonal entry of its
    row is zero. Diagonal entries (self-similarity) are ignored entirely.
    The input array is never modified.

    Parameters:
    matrix (numpy.ndarray): A square (n x n) similarity matrix.

    Returns:
    tuple: (number of subjects with missing data, percentage of subjects
        with missing data rounded to two decimals). An empty (0 x 0)
        matrix yields (0, 0.0).

    Raises:
    ValueError: If `matrix` is not a numpy ndarray, or is not a square
        two-dimensional array.
    """
    if not isinstance(matrix, np.ndarray):
        raise ValueError("Input must be a numpy ndarray.")

    # Checking ndim first keeps 0-D / 1-D input from failing with IndexError.
    if matrix.ndim != 2 or matrix.shape[0] != matrix.shape[1]:
        raise ValueError("Matrix must be square.")

    total_subjects = matrix.shape[0]
    if total_subjects == 0:
        # No subjects at all: nothing is missing and there is nothing to divide by.
        return 0, 0.0

    # `matrix != 0` allocates a fresh boolean array, so masking its diagonal
    # leaves the caller's matrix untouched.
    has_value = matrix != 0
    np.fill_diagonal(has_value, False)

    # A row with no non-zero off-diagonal entry is a subject with missing data.
    missing_count = int(np.count_nonzero(~has_value.any(axis=1)))

    # Share of subjects that are missing, as a percentage.
    missing_percentage = round((missing_count / total_subjects) * 100, 2)

    return missing_count, missing_percentage

In [49]:
# Summarise missing-data statistics for every similarity matrix of every dataset
def process_datasets(datasets, load_dataset_function):
    """
    Build a per-matrix missing-data summary for each dataset.

    Parameters:
    datasets (iterable): Dataset names; each one is passed to
        `load_dataset_function`.
    load_dataset_function (callable): Called as
        `load_dataset_function(name, embedding_type='llm')`. Must return a
        25-element sequence whose items at positions 2, 3 and 11 are the
        drug-disease matrix, the eight drug similarity matrices and the
        three disease similarity matrices respectively.

    Returns:
    pandas.DataFrame: One row per (dataset, matrix) pair with the columns
        Dataset, Matrix, Zero_Count, Missing_Percentage, Drug_Count,
        Disease_Count and Drug_Disease_Associations. Zero_Count and
        Missing_Percentage are the two values returned by
        `calculate_missing_data` for that matrix.
    """
    rows = []

    for name in datasets:
        (_, _, Wrd, Wrr_eight, _, _, _, _, _, _, _, Wdd_three, _, _, _, _, _, _, _, _, _, _, _, _, _) = load_dataset_function(name, embedding_type='llm')

        # Tuple-unpack so that an unexpected number of matrices fails loudly.
        chem_sim, atc_sim, side_sim, ddi_sim, target_sim, gep_sim, kg_sim, drug_llm_sim = Wrr_eight
        ph_sim, do_sim, disease_llm_sim = Wdd_three

        # Dataset-level figures repeated on every row for this dataset.
        dataset_stats = {
            'Drug_Count': Wrd.shape[0],
            'Disease_Count': Wrd.shape[1],
            'Drug_Disease_Associations': np.sum(Wrd != 0),
        }

        # (label, matrix) pairs in reporting order: drugs first, then diseases.
        labelled_matrices = (
            ('Drug_ChemS', chem_sim),
            ('Drug_AtcS', atc_sim),
            ('Drug_SideS', side_sim),
            ('Drug_DDIS', ddi_sim),
            ('Drug_TargetS', target_sim),
            ('Drug_GepS', gep_sim),
            ('Drug_KgS', kg_sim),
            ('Drug_LlmS', drug_llm_sim),
            ('Disease_PhS', ph_sim),
            ('Disease_DoS', do_sim),
            ('Disease_LlmS', disease_llm_sim),
        )

        for label, similarity in labelled_matrices:
            n_missing, pct_missing = calculate_missing_data(similarity)
            rows.append({
                'Dataset': name,
                'Matrix': label,
                'Zero_Count': n_missing,
                'Missing_Percentage': pct_missing,
                **dataset_stats,
            })

    # Assemble all collected rows into a single table.
    return pd.DataFrame(rows)

In [50]:
# Names of the datasets to summarise; each is handed to the dataset loader in turn.
datasets = [
    'Fdataset',
    'Cdataset',
    'Ydataset',
    'iDrug',
]

In [51]:
# Build the missing-data summary for every dataset, then show it as the cell output.
results_df = process_datasets(
    datasets=datasets,
    load_dataset_function=load_dataset,
)
results_df

Unnamed: 0,Dataset,Matrix,Zero_Count,Missing_Percentage,Drug_Count,Disease_Count,Drug_Disease_Associations
0,Fdataset,Drug_ChemS,0,0.0,593,313,1933
1,Fdataset,Drug_AtcS,25,4.22,593,313,1933
2,Fdataset,Drug_SideS,72,12.14,593,313,1933
3,Fdataset,Drug_DDIS,3,0.51,593,313,1933
4,Fdataset,Drug_TargetS,33,5.56,593,313,1933
5,Fdataset,Drug_GepS,197,33.22,593,313,1933
6,Fdataset,Drug_KgS,1,0.17,593,313,1933
7,Fdataset,Drug_LlmS,0,0.0,593,313,1933
8,Fdataset,Disease_PhS,0,0.0,593,313,1933
9,Fdataset,Disease_DoS,171,54.63,593,313,1933


In [52]:
# Save the summary table as an Excel workbook; index=False leaves the
# DataFrame's row index out of the sheet.
# NOTE(review): presumably 'data/other/' must already exist relative to the
# notebook's working directory — confirm before a fresh run.
results_df.to_excel('data/other/Data Distribution.xlsx', index=False)