In [1]:
import numpy as np
import pandas as pd

In [2]:
def min_max_scale_2d_arr(arr: np.ndarray) -> np.ndarray:
    """Linearly rescale an array using its finite maximum and its minimum.

    Entries equal to ``+inf`` are excluded when the maximum is determined and
    are replaced by ``0`` before the scaling formula is applied, so they come
    out as ``(0 - min) / (max - min)`` (which is ``0`` only when the minimum
    is ``0``). All other entries are mapped with ``(x - min) / (max - min)``.

    The input is never modified; a new array is returned.

    Parameters
    ----------
    arr : np.ndarray
        Numeric array (any array-like accepted by ``np.asarray``). Intended
        for 2-D matrices but works for any number of dimensions.

    Returns
    -------
    np.ndarray
        Scaled array with the same shape as ``arr``.

    Raises
    ------
    ValueError
        If ``arr`` is empty or contains no entry other than ``+inf``.

    Notes
    -----
    If every non-``inf`` entry has the same value, ``max - min`` is zero and
    the result contains ``nan``/``inf`` (NumPy emits a RuntimeWarning).
    ``NaN`` entries propagate to the whole result.
    """
    arr = np.asarray(arr)

    inf_mask = arr == float('inf')

    # The maximum ignores +inf; the minimum is taken over every entry
    # (+inf can never be the minimum unless nothing else is present).
    max_val = arr[~inf_mask].max()
    min_val = arr.min()

    # Work on a copy so the caller's array keeps its +inf entries.
    cleaned = arr.copy()
    cleaned[inf_mask] = 0

    return (cleaned - min_val) / (max_val - min_val)

In [3]:
# Load the CLR network and give the unnamed first CSV column a proper name.
clr_df = pd.read_csv('./clr_network_for_distances_6.csv').rename(
    columns={'Unnamed: 0': 'TTHERM_ID'}
)
print(clr_df.shape)
clr_df.head()

(19152, 19153)


Unnamed: 0,TTHERM_ID,TTHERM_00161860,TTHERM_00161850,TTHERM_00161840,TTHERM_00161830,TTHERM_00161790,TTHERM_00161780,TTHERM_000161759,YF00000015.t1,TTHERM_00161750,...,YF00038356.t1,YF00038359.t1,YF00038369.t1,TTHERM_01068130,YF00038374.t1,TTHERM_01082890,YF00038376.t1,YF00038377.t1,TTHERM_000989489,YF00038707.t1
0,TTHERM_00161860,0.0,0.0,1.01177,0.544702,3.790811,0.072156,0.0,2.830599,0.0,...,0.000849,0.24706,0.00426,2.553476,0.080188,0.0,0.081121,0.0,0.099867,4.388899
1,TTHERM_00161850,0.0,0.0,0.662679,2.10698,1.319153,0.0,0.285196,0.0,0.018998,...,0.0,3.250603,3.568258,0.908856,0.057345,0.048619,0.217558,1.786403,0.751179,0.0
2,TTHERM_00161840,1.01177,0.662679,0.0,6.703985,3.812577,0.0,2.071413,0.0,2.330468,...,2.817666,3.527004,0.0,4.432487,8.796609,0.0,0.355199,1.059101,0.0,0.981875
3,TTHERM_00161830,0.544702,2.10698,6.703985,0.0,1.513556,0.0,1.680607,0.562822,6.76474,...,1.796626,1.439393,1.498757,2.880878,2.741864,0.801501,0.155717,0.676565,0.396551,0.039557
4,TTHERM_00161790,3.790811,1.319153,3.812577,1.513556,0.0,0.0,4.464332,0.0,0.125314,...,2.46758,0.564686,0.0,4.384075,1.004587,0.0,0.0,0.00011,0.00209,3.042215


In [4]:
# Every column after the first (the TTHERM_ID labels) holds the numeric scores.
zscore_arr = clr_df[clr_df.columns[1:]].to_numpy()
zscore_arr

array([[0.        , 0.        , 1.0117703 , ..., 0.        , 0.09986675,
        4.38889906],
       [0.        , 0.        , 0.66267856, ..., 1.78640303, 0.75117937,
        0.        ],
       [1.0117703 , 0.66267856, 0.        , ..., 1.0591007 , 0.        ,
        0.98187478],
       ...,
       [0.        , 1.78640303, 1.0591007 , ..., 0.        , 0.        ,
        0.0452325 ],
       [0.09986675, 0.75117937, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [4.38889906, 0.        , 0.98187478, ..., 0.0452325 , 0.        ,
        0.        ]])

# My solution

In [5]:
# Rescale the raw scores with the min-max helper defined above.
scaled_zscore_arr = min_max_scale_2d_arr(arr=zscore_arr)
scaled_zscore_arr

array([[0.        , 0.        , 0.0025553 , ..., 0.        , 0.00025222,
        0.0110845 ],
       [0.        , 0.        , 0.00167365, ..., 0.0045117 , 0.00189716,
        0.        ],
       [0.0025553 , 0.00167365, 0.        , ..., 0.00267484, 0.        ,
        0.0024798 ],
       ...,
       [0.        , 0.0045117 , 0.00267484, ..., 0.        , 0.        ,
        0.00011424],
       [0.00025222, 0.00189716, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.0110845 , 0.        , 0.0024798 , ..., 0.00011424, 0.        ,
        0.        ]])

In [6]:
# NOTE: `eps` is the float64 machine epsilon (gap between 1.0 and the next
# representable float), not the smallest positive float64.
info = np.finfo(np.dtype('float64'))
smallest_float = info.eps
smallest_float

2.220446049250313e-16

In [7]:
# Convert the scaled score s into a distance d = sqrt(2 * (1 - s)), then add
# `smallest_float` so that no entry is exactly zero.
# The formula is based on these two resources:
# https://medium.com/swlh/is-correlation-distance-a-metric-5a383973978f
# https://stats.stackexchange.com/questions/165194/using-correlation-as-distance-metric-for-hierarchical-clustering
clr_dist_arr = smallest_float + np.sqrt((1 - scaled_zscore_arr) * 2)
clr_dist_arr

array([[1.41421356, 1.41421356, 1.41240553, ..., 1.41421356, 1.4140352 ,
        1.4063538 ],
       [1.41421356, 1.41421356, 1.41302962, ..., 1.4110197 , 1.41287143,
        1.41421356],
       [1.41240553, 1.41302962, 1.41421356, ..., 1.4123209 , 1.41421356,
        1.41245899],
       ...,
       [1.41421356, 1.4110197 , 1.4123209 , ..., 1.41421356, 1.41421356,
        1.41413278],
       [1.4140352 , 1.41287143, 1.41421356, ..., 1.41421356, 1.41421356,
        1.41421356],
       [1.4063538 , 1.41421356, 1.41245899, ..., 1.41413278, 1.41421356,
        1.41421356]])

In [8]:
# A distance matrix needs zeros on its diagonal (modifies clr_dist_arr in place).
np.fill_diagonal(clr_dist_arr, val=0)

# Your suggested solution

In [9]:
# 1 / zscore for all zscores != zero
# scale values linearly 0 to 1
# assign 1s to idxs where original zscores == zero

In [10]:
# Remember where the original scores are exactly zero; those positions are
# handled separately after scaling.
zero_zscore_idxs = np.where(zscore_arr == 0)

# Dividing by the zero entries is intentional (they become inf, which the
# scaling helper treats specially), so silence the divide-by-zero warning.
with np.errstate(divide='ignore'):
    inverse_zscore_arr = 1 / zscore_arr

scaled_inverse_zscore_arr = min_max_scale_2d_arr(inverse_zscore_arr)
scaled_inverse_zscore_arr

  inverse_zscore_arr = 1 / zscore_arr


array([[-4.64807962e-20, -4.64807962e-20,  1.81434517e-17, ...,
        -4.64807962e-20,  1.84239419e-16,  4.14683358e-18],
       [-4.64807962e-20, -4.64807962e-20,  2.77257071e-17, ...,
         1.02558044e-17,  2.44537040e-17, -4.64807962e-20],
       [ 1.81434517e-17,  2.77257071e-17, -4.64807962e-20, ...,
         1.73305576e-17, -4.64807962e-20,  1.86972875e-17],
       ...,
       [-4.64807962e-20,  1.02558044e-17,  1.73305576e-17, ...,
        -4.64807962e-20, -4.64807962e-20,  4.06829856e-16],
       [ 1.84239419e-16,  2.44537040e-17, -4.64807962e-20, ...,
        -4.64807962e-20, -4.64807962e-20, -4.64807962e-20],
       [ 4.14683358e-18, -4.64807962e-20,  1.86972875e-17, ...,
         4.06829856e-16, -4.64807962e-20, -4.64807962e-20]])

In [11]:
# Set every position whose original score was zero to 1. `zero_zscore_idxs`
# is the (rows, cols) tuple from np.where, so it can index the array directly
# instead of looping over each position in Python.
scaled_inverse_zscore_arr[zero_zscore_idxs] = 1
scaled_inverse_zscore_arr

array([[1.00000000e+00, 1.00000000e+00, 1.81434517e-17, ...,
        1.00000000e+00, 1.84239419e-16, 4.14683358e-18],
       [1.00000000e+00, 1.00000000e+00, 2.77257071e-17, ...,
        1.02558044e-17, 2.44537040e-17, 1.00000000e+00],
       [1.81434517e-17, 2.77257071e-17, 1.00000000e+00, ...,
        1.73305576e-17, 1.00000000e+00, 1.86972875e-17],
       ...,
       [1.00000000e+00, 1.02558044e-17, 1.73305576e-17, ...,
        1.00000000e+00, 1.00000000e+00, 4.06829856e-16],
       [1.84239419e-16, 2.44537040e-17, 1.00000000e+00, ...,
        1.00000000e+00, 1.00000000e+00, 1.00000000e+00],
       [4.14683358e-18, 1.00000000e+00, 1.86972875e-17, ...,
        4.06829856e-16, 1.00000000e+00, 1.00000000e+00]])

In [12]:
# One distance is still zero because of the min-max scaling (the minimum maps to 0).
np.nonzero(scaled_inverse_zscore_arr == 0)

(array([2284, 2287]), array([2287, 2284]))

In [13]:
# A distance matrix needs zeros on its diagonal (modifies the array in place).
np.fill_diagonal(scaled_inverse_zscore_arr, val=0)