In [None]:
import numpy as np
import pandas as pd

from scipy.stats import rankdata, spearmanr
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler

from utils.utils import add_key_value_to_mat

In [None]:
# Name of the dataset to process; it selects the data/<dataset_name>/ directory
# used for every path built in this notebook.
dataset_name = 'Ydataset'
# CSV files holding the disease and drug embeddings for the selected dataset.
disease_emb_file = f'data/{dataset_name}/{dataset_name}_disease_embedding.csv'
drug_emb_file = f'data/{dataset_name}/{dataset_name}_drug_embedding.csv'

In [None]:
# Load the disease embeddings and preview the first rows.
# NOTE(review): the similarity functions below use every column as an embedding
# dimension — confirm the CSV holds no ID/index column.
disease_emb_df = pd.read_csv(disease_emb_file)
disease_emb_df.head()

In [None]:
# Load the drug embeddings and preview the first rows.
# NOTE(review): the similarity functions below use every column as an embedding
# dimension — confirm the CSV holds no ID/index column.
drug_emb_df = pd.read_csv(drug_emb_file)
drug_emb_df.head()

In [None]:
# Sanity check: (rows, columns) of the disease and drug embedding tables.
disease_emb_df.shape, drug_emb_df.shape

In [None]:
def compute_cosine_similarity_exclude_zeros(emb_df):
    """
    Compute pairwise cosine similarity between embedding rows, skipping all-zero rows.

    All-zero rows are left out of the cosine computation (cosine similarity is
    undefined for a zero vector). The similarities among the remaining rows are
    min-max rescaled to [0, 1] using the global minimum and maximum of the
    similarity matrix, so the result stays symmetric. A per-column rescale
    (what ``MinMaxScaler().fit_transform`` does on a square matrix) would give
    ``sim[i, j] != sim[j, i]`` and is deliberately not used here.

    Parameters:
    - emb_df: pd.DataFrame, an entity embedding matrix (n, d), where each row represents a d-dimensional embedding vector of an entity

    Returns:
    - full_similarity_matrix: np.ndarray, a symmetric similarity matrix of shape (n, n) with values in the range [0, 1].
      Off-diagonal entries that involve an all-zero row are 0; every diagonal entry is 1.
      If fewer than two rows are non-zero, the identity matrix is returned.
    """
    # 1. Extract the embedding matrix
    emb_matrix = emb_df.values
    n_entities = emb_matrix.shape[0]

    # 2. Locate the rows that carry an actual (non-zero) embedding
    zero_rows = np.all(emb_matrix == 0, axis=1)  # Shape: (n,)
    non_zero_indices = np.flatnonzero(~zero_rows)

    # 3. Start from an all-zero (n, n) matrix; pairs involving zero rows keep similarity 0
    full_similarity_matrix = np.zeros((n_entities, n_entities))

    # 4. A similarity is only meaningful when at least two non-zero rows exist
    if non_zero_indices.size >= 2:
        similarity = cosine_similarity(emb_matrix[non_zero_indices])

        # 5. Global min-max rescale to [0, 1]; one shared min/max keeps the matrix symmetric
        sim_min = similarity.min()
        sim_range = similarity.max() - sim_min
        if sim_range > 0:
            similarity_adjusted = (similarity - sim_min) / sim_range
        else:
            # Every similarity is identical: nothing to rescale, map all values to 0
            similarity_adjusted = np.zeros_like(similarity)

        # 6. Scatter the (k, k) block back into the rows/columns of the non-zero entities
        full_similarity_matrix[np.ix_(non_zero_indices, non_zero_indices)] = similarity_adjusted

    # 7. Set diagonal values to 1 for all rows (including zero rows)
    np.fill_diagonal(full_similarity_matrix, 1)

    return full_similarity_matrix

In [None]:
def compute_spearman_similarity_exclude_zeros(emb_df):
    """
    Compute pairwise Spearman rank correlation between embedding rows, skipping all-zero rows.

    Spearman correlation is the Pearson correlation of the rank-transformed
    vectors, so each non-zero row is ranked once and a single ``np.corrcoef``
    call yields the whole correlation matrix (instead of one ``spearmanr`` call
    per ordered pair of rows).

    Parameters:
    - emb_df: pd.DataFrame, an entity embedding matrix (n, d), where each row represents a d-dimensional embedding vector of an entity

    Returns:
    - full_similarity_matrix: np.ndarray, a similarity matrix of shape (n, n) with values in the range [-1, 1].
      Off-diagonal entries that involve an all-zero row are 0; every diagonal entry is 1.
      If fewer than two rows are non-zero, the identity matrix is returned.
      A non-zero row whose values are all equal has no defined rank correlation,
      so its off-diagonal entries are NaN.
    """
    # 1. Extract the embedding matrix
    emb_matrix = emb_df.values
    n_entities = emb_matrix.shape[0]

    # 2. Locate the rows that carry an actual (non-zero) embedding
    zero_rows = np.all(emb_matrix == 0, axis=1)  # Shape: (n,)
    non_zero_indices = np.flatnonzero(~zero_rows)

    # 3. Start from an all-zero (n, n) matrix; pairs involving zero rows keep similarity 0
    full_similarity_matrix = np.zeros((n_entities, n_entities))

    # 4. A correlation is only meaningful when at least two non-zero rows exist
    if non_zero_indices.size >= 2:
        # Rank each row's values (ties get the average rank, as spearmanr does) ...
        ranks = rankdata(emb_matrix[non_zero_indices], axis=1)
        # ... then Pearson correlation between the ranked rows is the Spearman correlation
        similarity = np.corrcoef(ranks)

        # 5. Scatter the (k, k) block back into the rows/columns of the non-zero entities
        full_similarity_matrix[np.ix_(non_zero_indices, non_zero_indices)] = similarity

    # 6. Set diagonal values to 1 for all rows (including zero rows)
    np.fill_diagonal(full_similarity_matrix, 1)

    return full_similarity_matrix

In [None]:
# Build the (n, n) cosine-based similarity matrices from the drug and disease embeddings.
drug_sim = compute_cosine_similarity_exclude_zeros(drug_emb_df)
disease_sim = compute_cosine_similarity_exclude_zeros(disease_emb_df)

In [None]:
# Sanity check: each similarity matrix should be square, one row/column per embedding row.
drug_sim.shape, disease_sim.shape

In [None]:
# Preview the first 10 rows of the drug similarity matrix.
pd.DataFrame(drug_sim).head(10)

In [None]:
# Preview the first rows of the disease similarity matrix.
pd.DataFrame(disease_sim).head()

In [None]:
# Store both similarity matrices in the dataset's .mat file under the keys
# 'drug_LlmS' and 'disease_LlmS'. The same path is passed as the first and
# second argument of each call.
# NOTE(review): presumably the arguments are (input path, output path, key, value),
# i.e. the file is updated in place — confirm against utils.utils.add_key_value_to_mat.
mat_path = f'data/{dataset_name}/{dataset_name}.mat'
add_key_value_to_mat(mat_path, mat_path, 'drug_LlmS', drug_sim)
add_key_value_to_mat(mat_path, mat_path, 'disease_LlmS', disease_sim)