In [1]:
import numpy as np
import pandas as pd

from scipy.stats import spearmanr
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler

from utils.utils import add_key_value_to_mat

In [3]:
dataset_name = 'iDrug'

disease_emb_file_llm = f'data/{dataset_name}/{dataset_name}_disease_embedding_llm.csv'
drug_emb_file_llm = f'data/{dataset_name}/{dataset_name}_drug_embedding_llm.csv'

disease_emb_file_kg = f'data/{dataset_name}/{dataset_name}_disease_embedding_kg.csv'
drug_emb_file_kg = f'data/{dataset_name}/{dataset_name}_drug_embedding_kg.csv'

In [4]:
disease_emb_df_llm = pd.read_csv(disease_emb_file_llm)
disease_emb_df_llm.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
0,-0.050776,0.027389,-0.085385,-0.059408,-0.045792,0.026192,0.017785,0.07169,0.009059,0.013655,...,0.01445,-0.013695,-0.025918,-0.015833,-0.007691,-0.000364,0.064588,-0.01238,-0.011527,0.003107
1,0.010452,-0.023509,-0.09335,-0.049718,-0.037584,-0.029595,0.001662,0.10423,0.052077,-0.006538,...,0.007289,0.033304,-0.027047,0.004988,-0.01707,-0.003222,0.017736,0.03047,-4e-06,-0.043556
2,0.011881,-0.029443,-0.061471,-0.044625,-0.077496,-0.021725,0.020373,0.081649,0.017572,-0.026446,...,-0.006513,0.024781,0.000631,0.022214,0.016005,-0.015133,0.03912,0.048307,-0.081257,0.006029
3,-0.018325,-0.018045,-0.045884,-0.008926,-0.042838,-0.027183,0.02969,0.096395,0.048737,-0.000778,...,-0.007114,0.01525,-0.015761,-0.034182,-0.017313,-0.021091,-0.004176,-0.014189,-0.011114,0.009485
4,-0.012276,-0.023644,-0.04976,-0.007027,-0.059342,-0.010615,-0.011407,0.084531,0.039252,-0.021847,...,-0.018998,0.030366,-0.051267,-0.001468,0.013522,-0.053431,0.010904,-0.00636,-0.064828,0.029188


In [5]:
drug_emb_df_llm = pd.read_csv(drug_emb_file_llm)
drug_emb_df_llm.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
0,-0.023761,0.002709,-0.047252,-0.03485,-0.040568,-0.026794,0.066686,0.087395,0.056177,0.027509,...,0.03769,0.012818,-0.020091,0.007466,-0.018855,-0.01756,0.027451,-0.016585,0.065179,0.021482
1,-0.038531,0.018251,-0.077256,-0.080736,-0.025365,-0.039749,0.019691,0.12907,0.049261,0.002071,...,0.023664,0.001008,0.035051,0.026989,-0.015099,-0.031513,0.057343,0.023606,-0.009995,0.013166
2,-0.023961,-0.001277,-0.07447,-0.071618,-0.035276,-0.094704,0.017752,0.065951,0.014015,0.01045,...,0.027575,0.005453,-0.024075,0.060284,0.028145,-0.015794,-0.007792,0.025045,0.039441,0.010184
3,0.025736,-0.030559,-0.065053,-0.05749,-0.086351,-0.009685,0.038661,0.093682,0.056641,0.016784,...,0.015231,-0.004519,-0.001429,0.023922,-0.004666,-0.019736,0.022533,0.032816,-0.024424,0.012048
4,-0.020634,0.030419,-0.057463,-0.048322,-0.063039,-0.066984,-0.01502,0.114016,0.012308,-0.007984,...,0.015124,-0.019799,0.018377,0.037664,0.008354,-0.036014,0.047336,-0.009663,-0.023365,0.006903


In [6]:
disease_emb_df_kg = pd.read_csv(disease_emb_file_kg)
disease_emb_df_kg.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,118,119,120,121,122,123,124,125,126,127
0,-0.042819,0.019587,0.021835,-0.065546,-0.097492,-0.110057,-0.093956,-0.07086,-0.005295,0.0232,...,-0.122893,0.105927,-0.018829,0.053347,0.131469,0.018592,0.081929,-0.126271,-0.009971,-0.107127
1,0.092903,-0.05821,-0.053728,-0.110415,0.016019,0.107386,-0.092139,-0.055179,0.115341,0.121444,...,-0.152217,0.13006,-0.123285,-0.110793,0.006118,0.079664,-0.006056,-0.061849,-0.109642,0.05207
2,0.025672,0.051487,0.105357,0.08573,-0.028384,0.077779,-0.132957,0.041709,0.073524,-0.08306,...,-0.07049,-0.021706,-0.120221,-0.040715,0.147253,-0.024546,-0.110789,-0.142584,0.127281,0.006239
3,-0.101121,-0.051122,-0.08556,-0.05264,0.100229,0.096592,0.00395,0.09729,0.046156,0.068706,...,0.140096,-0.124377,-0.060231,-0.144415,-0.048132,-0.026528,0.086677,-0.090442,0.139299,0.08772
4,-0.053272,0.054441,0.076613,-0.055496,-0.046122,-0.052633,-0.153342,0.135744,0.040873,-0.02766,...,0.013104,-0.039769,-0.097765,-0.145476,-0.023161,-0.09169,0.043037,0.043663,-0.002354,-0.095108


In [7]:
drug_emb_df_kg = pd.read_csv(drug_emb_file_kg)
drug_emb_df_kg.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,118,119,120,121,122,123,124,125,126,127
0,-0.042819,0.019587,0.021835,-0.065546,-0.097492,-0.110057,-0.093956,-0.07086,-0.005295,0.0232,...,-0.122893,0.105927,-0.018829,0.053347,0.131469,0.018592,0.081929,-0.126271,-0.009971,-0.107127
1,-0.101121,-0.051122,-0.08556,-0.05264,0.100229,0.096592,0.00395,0.09729,0.046156,0.068706,...,0.140096,-0.124377,-0.060231,-0.144415,-0.048132,-0.026528,0.086677,-0.090442,0.139299,0.08772
2,0.056773,0.082533,-0.077263,0.095912,0.109043,-0.031878,0.052879,0.020745,-0.131953,-0.042897,...,0.129152,0.023077,-0.00244,0.013331,0.134417,-0.151635,0.116206,-0.069878,-0.107293,-0.119796
3,0.079398,-0.074269,-0.021165,-0.002437,0.057135,-0.064971,0.013669,-0.030797,0.149805,0.017321,...,0.063156,0.008089,0.13764,-0.015574,-0.011778,0.039605,0.043447,0.098194,-0.116403,0.104637
4,-0.060851,0.112325,-0.050144,0.031543,0.087084,-0.094419,-0.046441,0.070325,0.075493,-0.117555,...,0.109962,-0.091538,0.146765,-0.054461,0.073533,-0.122256,0.120194,0.104693,-0.01687,0.01366


In [8]:
disease_emb_df_llm.shape, drug_emb_df_llm.shape, disease_emb_df_kg.shape, drug_emb_df_kg.shape

((3966, 1024), (1321, 1024), (3966, 128), (1321, 128))

In [10]:
def compute_cosine_similarity_exclude_zeros(emb_df):
    """
    Compute the cosine similarity of an embedding matrix, excluding all-zero rows, and handle diagonal values.

    Parameters:
    - emb_df: pd.DataFrame, an entity embedding matrix (n, d), where each row represents a d-dimensional embedding vector of an entity

    Returns:
    - full_similarity_matrix: np.ndarray, a similarity matrix of shape (n, n) with values in the range [0, 1]
    """
    # 1. Extract the embedding matrix
    emb_matrix = emb_df.values  # Convert DataFrame to NumPy array

    # 2. Check for all-zero rows
    zero_rows = np.all(emb_matrix == 0, axis=1)  # Shape: (n,)

    # 3. Filter out all-zero rows, using only non-zero rows for similarity calculation
    non_zero_matrix = emb_matrix[~zero_rows]

    # 4. If the number of non-zero vectors is less than 2, return a diagonal matrix with ones
    if non_zero_matrix.shape[0] < 2:
        full_similarity_matrix = np.zeros((emb_matrix.shape[0], emb_matrix.shape[0]))
        np.fill_diagonal(full_similarity_matrix, 1)
        return full_similarity_matrix

    # 5. Compute the cosine similarity matrix for the filtered non-zero rows
    similarity = cosine_similarity(non_zero_matrix)  # Compute similarity only between non-zero rows

    # 6. Scale cosine similarity values to the range [0, 1]
    similarity_adjusted = MinMaxScaler().fit_transform(similarity)

    # 7. Construct the full similarity matrix
    full_similarity_matrix = np.zeros((emb_matrix.shape[0], emb_matrix.shape[0]))  # Initialize an all-zero matrix

    # 8. Use boolean indexing to fill the similarity values back into the full matrix for non-zero rows and columns
    non_zero_indices = np.where(~zero_rows)[0]  # Extract indices of non-zero rows
    for i, row_idx in enumerate(non_zero_indices):
        for j, col_idx in enumerate(non_zero_indices):
            full_similarity_matrix[row_idx, col_idx] = similarity_adjusted[i, j]

    # 9. Set diagonal values to 1 for all rows (including zero rows)
    np.fill_diagonal(full_similarity_matrix, 1)

    return full_similarity_matrix

In [11]:
def compute_spearman_similarity_exclude_zeros(emb_df):
    """
    Compute the Spearman correlation of an embedding matrix, excluding all-zero rows, and handle diagonal values.

    Parameters:
    - emb_df: pd.DataFrame, an entity embedding matrix (n, d), where each row represents a d-dimensional embedding vector of an entity

    Returns:
    - full_similarity_matrix: np.ndarray, a similarity matrix of shape (n, n) with values in the range [-1, 1]
    """
    # 1. Extract the embedding matrix
    emb_matrix = emb_df.values  # Convert DataFrame to NumPy array

    # 2. Check for all-zero rows
    zero_rows = np.all(emb_matrix == 0, axis=1)  # Shape: (n,)

    # 3. Filter out all-zero rows, using only non-zero rows for similarity calculation
    non_zero_matrix = emb_matrix[~zero_rows]

    # 4. If the number of non-zero vectors is less than 2, return a diagonal matrix with ones
    if non_zero_matrix.shape[0] < 2:
        full_similarity_matrix = np.zeros((emb_matrix.shape[0], emb_matrix.shape[0]))
        np.fill_diagonal(full_similarity_matrix, 1)
        return full_similarity_matrix

    # 5. Compute the Spearman correlation matrix for the filtered non-zero rows
    similarity = np.zeros((non_zero_matrix.shape[0], non_zero_matrix.shape[0]))
    for i in range(non_zero_matrix.shape[0]):
        for j in range(non_zero_matrix.shape[0]):
            if i != j:
                corr, _ = spearmanr(non_zero_matrix[i], non_zero_matrix[j])
                similarity[i, j] = corr

    # 6. Construct the full similarity matrix
    full_similarity_matrix = np.zeros((emb_matrix.shape[0], emb_matrix.shape[0]))  # Initialize an all-zero matrix

    # 7. Use boolean indexing to fill the similarity values back into the full matrix for non-zero rows and columns
    non_zero_indices = np.where(~zero_rows)[0]  # Extract indices of non-zero rows
    for i, row_idx in enumerate(non_zero_indices):
        for j, col_idx in enumerate(non_zero_indices):
            full_similarity_matrix[row_idx, col_idx] = similarity[i, j]

    # 8. Set diagonal values to 1 for all rows (including zero rows)
    np.fill_diagonal(full_similarity_matrix, 1)

    return full_similarity_matrix

In [13]:
drug_sim_llm = compute_cosine_similarity_exclude_zeros(drug_emb_df_llm)
disease_sim_llm = compute_cosine_similarity_exclude_zeros(disease_emb_df_llm)

drug_sim_kg = compute_cosine_similarity_exclude_zeros(drug_emb_df_kg)
disease_sim_kg = compute_cosine_similarity_exclude_zeros(disease_emb_df_kg)

In [14]:
drug_sim_llm.shape, disease_sim_llm.shape, drug_sim_kg.shape, disease_sim_kg.shape

((1321, 1321), (3966, 3966), (1321, 1321), (3966, 3966))

In [15]:
pd.DataFrame(drug_sim_llm).head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1311,1312,1313,1314,1315,1316,1317,1318,1319,1320
0,1.0,0.257943,0.351178,0.382446,0.242623,0.335361,0.306297,0.215755,0.195768,0.170808,...,0.197304,0.172222,0.244902,0.330019,0.374987,0.249916,0.170026,0.247444,0.342067,0.318588
1,0.287479,1.0,0.213079,0.331431,0.551848,0.285123,0.302825,0.214203,0.16852,0.191569,...,0.274989,0.183067,0.184653,0.282074,0.308835,0.254541,0.40193,0.275696,0.229671,0.300575
2,0.361211,0.193132,1.0,0.314088,0.148384,0.281473,0.314371,0.192002,0.193936,0.210651,...,0.161365,0.207881,0.169827,0.254429,0.294881,0.207639,0.157394,0.269258,0.250298,0.215416
3,0.39676,0.319855,0.319463,1.0,0.266519,0.342197,0.449494,0.199012,0.130647,0.204383,...,0.156965,0.213765,0.248649,0.177504,0.362161,0.260527,0.269767,0.363156,0.262807,0.347103
4,0.25711,0.542199,0.151553,0.263477,1.0,0.256477,0.388068,0.194973,0.098573,0.211945,...,0.18908,0.192192,0.162339,0.156723,0.404349,0.21703,0.314254,0.229301,0.293006,0.275302
5,0.362813,0.286241,0.300333,0.354404,0.273288,1.0,0.289365,0.181354,0.252768,0.229648,...,0.13196,0.246809,0.175151,0.230374,0.435359,0.321632,0.367664,0.294889,0.353707,0.249112
6,0.227733,0.191695,0.224733,0.372606,0.30548,0.174798,1.0,0.093623,0.04888,0.119662,...,0.052885,0.173588,0.077498,0.150572,0.255823,0.205346,0.246369,0.195257,0.154113,0.314308
7,0.261888,0.229771,0.227589,0.228243,0.227554,0.196315,0.233725,1.0,0.433039,0.464667,...,0.130169,0.24156,0.167456,0.171034,0.214232,0.192393,0.126777,0.30328,0.252481,0.232766
8,0.257192,0.200191,0.243807,0.177993,0.151186,0.280104,0.210891,0.443612,1.0,0.278594,...,0.135475,0.233848,0.201562,0.20973,0.228701,0.242253,0.196896,0.221544,0.260843,0.256655
9,0.13729,0.124024,0.165845,0.152581,0.164102,0.163977,0.177256,0.408216,0.187367,1.0,...,0.162617,0.160549,0.348863,0.07301,0.176948,0.237435,0.114654,0.181965,0.102842,0.273454


In [16]:
pd.DataFrame(disease_sim_llm).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3956,3957,3958,3959,3960,3961,3962,3963,3964,3965
0,1.0,0.274296,0.248969,0.200785,0.400967,0.317172,0.338688,0.192265,0.169746,0.308986,...,0.253723,0.27566,0.212067,0.203241,0.393401,0.201653,0.260877,0.151658,0.317736,0.190684
1,0.308417,1.0,0.366442,0.257731,0.389078,0.367698,0.227407,0.499388,0.306786,0.430452,...,0.303622,0.246742,0.462113,0.178705,0.374751,0.273325,0.237845,0.298589,0.257561,0.20742
2,0.291502,0.372834,1.0,0.302749,0.300462,0.345028,0.140931,0.387894,0.4605,0.308956,...,0.383582,0.29659,0.392543,0.33437,0.265326,0.398064,0.284001,0.424827,0.343831,0.261336
3,0.244591,0.263801,0.301402,1.0,0.195303,0.21596,0.301033,0.244247,0.337116,0.275823,...,0.294305,0.315041,0.296527,0.379231,0.317819,0.453275,0.295338,0.409885,0.299122,0.252983
4,0.415273,0.374246,0.276176,0.16897,1.0,0.329851,0.382925,0.248991,0.288364,0.293568,...,0.178706,0.2786,0.27461,0.179454,0.290146,0.24368,0.204652,0.234827,0.240451,0.115189


In [17]:
add_key_value_to_mat(f'data/{dataset_name}/{dataset_name}.mat', f'data/{dataset_name}/{dataset_name}.mat', 'drug_LlmS', drug_sim_llm)
add_key_value_to_mat(f'data/{dataset_name}/{dataset_name}.mat', f'data/{dataset_name}/{dataset_name}.mat', 'disease_LlmS', disease_sim_llm)

Successfully added key 'drug_LlmS' to the .mat file and saved it as data/iDrug/iDrug.mat
Successfully added key 'disease_LlmS' to the .mat file and saved it as data/iDrug/iDrug.mat


In [18]:
add_key_value_to_mat(f'data/{dataset_name}/{dataset_name}.mat', f'data/{dataset_name}/{dataset_name}.mat', 'drug_KgS', drug_sim_kg)
add_key_value_to_mat(f'data/{dataset_name}/{dataset_name}.mat', f'data/{dataset_name}/{dataset_name}.mat', 'disease_KgS', disease_sim_kg)

Successfully added key 'drug_KgS' to the .mat file and saved it as data/iDrug/iDrug.mat
Successfully added key 'disease_KgS' to the .mat file and saved it as data/iDrug/iDrug.mat
