In [1]:
import numpy as np

# Example initialization of a 50-column numpy array with NaN, 0, and 1
np.random.seed(42)  # For reproducible results
data = np.random.choice([np.nan, 0, 1], size=(100, 50))

# Despite its name, nan_mask is True where an entry is PRESENT (not NaN).
nan_mask = ~np.isnan(data)

num_rows = data.shape[0]

# Add a broadcast axis so every row is compared against every other row.
# Resulting arrays have shape (num_rows, num_rows, num_columns).
data_expanded_1 = np.expand_dims(data, axis=1)
data_expanded_2 = np.expand_dims(data, axis=0)
mask_expanded_1 = np.expand_dims(nan_mask, axis=1)
mask_expanded_2 = np.expand_dims(nan_mask, axis=0)

# A column is comparable for a pair only if both rows have a value there.
valid_mask = mask_expanded_1 & mask_expanded_2

# Element-wise equality restricted to comparable columns.
equality = (data_expanded_1 == data_expanded_2) & valid_mask

# Number of comparable columns, and number of matching columns, per pair.
valid_counts = np.sum(valid_mask, axis=2)
equal_counts = np.sum(equality, axis=2)

# Similarity = matches / comparable columns. Pairs with no comparable column
# keep the NaN fill value ("undefined") instead of being reported as 0.0,
# which would be indistinguishable from "compared and nothing matched".
similarity = np.full((num_rows, num_rows), np.nan)
np.divide(equal_counts, valid_counts, out=similarity, where=valid_counts > 0)

similarity_matrix = similarity

print(similarity_matrix)

[[1.         0.7        0.5        ... 0.45454545 0.54545455 0.7037037 ]
 [0.7        1.         0.30434783 ... 0.57142857 0.38888889 0.375     ]
 [0.5        0.30434783 1.         ... 0.43478261 0.6        0.54166667]
 ...
 [0.45454545 0.57142857 0.43478261 ... 1.         0.44444444 0.4       ]
 [0.54545455 0.38888889 0.6        ... 0.44444444 1.         0.47619048]
 [0.7037037  0.375      0.54166667 ... 0.4        0.47619048 1.        ]]


In [9]:
# A plain sum propagates NaN: `data` was drawn from [NaN, 0, 1], so as soon as
# one entry is NaN the total is NaN as well (np.nansum would skip them).
data.sum()

nan

In [10]:
import numpy as np
import time

# Rebuild the demo input for the benchmark: seed NumPy's global RNG so the
# draw is repeatable, then sample a 100 x 50 matrix of NaN / 0 / 1 entries.
np.random.seed(42)
data = np.random.choice(np.array([np.nan, 0, 1]), size=(100, 50))

def pairwise_similarity(data):
    """Loop-based reference implementation of the row-by-row similarity.

    Each pair of rows is compared only on the columns where neither row is
    NaN. The similarity is the fraction of those columns holding equal values.

    Parameters
    ----------
    data : np.ndarray
        2-D array whose rows are compared; NaN marks a missing entry.

    Returns
    -------
    np.ndarray
        Symmetric ``(rows, rows)`` float matrix. An entry is NaN when the two
        rows have no column in which both are non-NaN.
    """
    n = data.shape[0]
    # Start from "undefined" everywhere; only comparable pairs are filled in.
    result = np.full((n, n), np.nan)

    for a in range(n):
        left = data[a]
        # Only the upper triangle is computed; the value is mirrored below.
        for b in range(a, n):
            right = data[b]

            # Columns where both rows have a value (De Morgan of "either is NaN").
            shared = ~(np.isnan(left) | np.isnan(right))
            left_vals = left[shared]
            right_vals = right[shared]

            n_shared = left_vals.size
            if n_shared == 0:
                score = np.nan
            else:
                score = np.count_nonzero(left_vals == right_vals) / n_shared

            result[a, b] = score
            result[b, a] = score

    return result

def vectorized_similarity(data):
    """Compute the pairwise row similarity of ``data`` using broadcasting.

    For every pair of rows, only the columns where both rows are non-NaN are
    compared. The similarity is the fraction of those columns holding equal
    values.

    Parameters
    ----------
    data : np.ndarray
        2-D array whose rows are compared; NaN marks a missing entry.

    Returns
    -------
    np.ndarray
        ``(rows, rows)`` float matrix. An entry is NaN when the two rows share
        no non-NaN column, the same convention as ``pairwise_similarity``.

    Notes
    -----
    The broadcasted intermediates have shape ``(rows, rows, columns)``, so
    memory use grows quadratically with the number of rows.
    """
    # True where an entry is present (not NaN).
    nan_mask = ~np.isnan(data)

    # Insert a broadcast axis so each row is paired with every other row.
    data_expanded_1 = np.expand_dims(data, axis=1)
    data_expanded_2 = np.expand_dims(data, axis=0)
    mask_expanded_1 = np.expand_dims(nan_mask, axis=1)
    mask_expanded_2 = np.expand_dims(nan_mask, axis=0)

    # Columns usable for a pair, and columns that are usable and equal.
    valid_mask = mask_expanded_1 & mask_expanded_2
    equality = (data_expanded_1 == data_expanded_2) & valid_mask

    valid_counts = np.sum(valid_mask, axis=2)
    equal_counts = np.sum(equality, axis=2)

    # Divide only where at least one column is comparable. Everywhere else the
    # NaN fill value is kept, so "nothing to compare" is not reported as 0.0.
    similarity = np.full(valid_counts.shape, np.nan)
    np.divide(equal_counts, valid_counts, out=similarity, where=valid_counts > 0)

    return similarity

# Time the loop-based implementation.
# time.perf_counter() is a monotonic, high-resolution clock meant for
# measuring short intervals; time.time() is wall-clock time and can jump if
# the system clock is adjusted, which matters for millisecond-scale timings.
start_time = time.perf_counter()
pairwise_similarity_matrix = pairwise_similarity(data)
pairwise_time = time.perf_counter() - start_time

# Time the broadcasting implementation on the same input.
start_time = time.perf_counter()
vectorized_similarity_matrix = vectorized_similarity(data)
vectorized_time = time.perf_counter() - start_time

# Each figure comes from a single run, so treat it as indicative only.
print(f"Pairwise calculation time: {pairwise_time:.4f} seconds")
print(f"Vectorized calculation time: {vectorized_time:.4f} seconds")

# Check if the results are the same (equal_nan=True treats NaN entries at the
# same position in both matrices as equal).
print(f"Are the results the same? {np.allclose(pairwise_similarity_matrix, vectorized_similarity_matrix, equal_nan=True)}")


Pairwise calculation time: 0.0594 seconds
Vectorized calculation time: 0.0020 seconds
Are the results the same? True
