# Dimensionality Reduction for Embedding.

This notebook evaluates 2D dimensionality reductions (t-SNE and UMAP) of document embeddings using two metrics: kNN accuracy and kNN recall.

## Setting up Workspace

### Set up GPUs

In [None]:
# GPU information:

gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Thu Sep  5 04:31:44 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   48C    P8              12W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In order to use a GPU with your notebook, open the **Runtime > Change runtime type** menu and set the hardware accelerator dropdown to GPU.

### High RAM

In [None]:
from psutil import virtual_memory

# Total memory reported by psutil, converted from bytes to decimal gigabytes.
ram_gb = virtual_memory().total / 1e9
ram_report = 'Your runtime has {:.1f} gigabytes of available RAM\n'
print(ram_report.format(ram_gb))

# 20 GB is the cut-off this notebook uses to tell a high-RAM runtime apart.
if ram_gb >= 20:
  print('You are using a high-RAM runtime!')
else:
  print('Not using a high-RAM runtime')

Your runtime has 13.6 gigabytes of available RAM

Not using a high-RAM runtime


Users who have purchased one of Colab's paid plans have access to high-memory VMs when they are available.

You can see how much memory you have available at any time by running the code cell above. If the execution result of that cell is "Not using a high-RAM runtime", you can enable a high-RAM runtime via **Runtime > Change runtime type** in the menu: select High-RAM in the Runtime shape dropdown, then re-execute the code cell.

### Set up Libraries

In [None]:
# Installing all library dependencies with their versions.
# This could take up to 3 minutes to run.

!pip install --quiet h5py==3.9.0 numpy==1.25.2 pandas==1.5.3 plotly==5.15.0 psutil==5.9.5 scikit-learn==1.2.2 umap-learn==0.5.5 kaleido==0.2.1

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m90.9/90.9 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.8/4.8 MB[0m [31m65.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.2/18.2 MB[0m [31m87.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.0/12.0 MB[0m [31m104.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.5/15.5 MB[0m [31m75.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.6/9.6 MB[0m [31m97.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.9/79.9 MB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.9/56.9 kB[0m [31m5.8 MB/s[0m eta [36m0:00:

In [None]:
# If the torch version shown in the install output above differs from the one
# printed by this cell, the version printed here is the correct one.
import torch
import sklearn

# Report scikit-learn first, then torch.
for library in (sklearn, torch):
    print(library.__version__)

1.2.2
2.5.1+cu124


## Load Embeddings

In [None]:
import pandas as pd

# HDF5 file holding the embeddings table; it is read from the 'embeddings' key.
EMBEDDINGS_PATH = 'Files/embeddings_full_tSNE_uMAP_01MAR2024.h5'
df = pd.read_hdf(EMBEDDINGS_PATH, key='embeddings')

In [None]:
# Preview the first five rows of the loaded table.
df.head(n=5)

Unnamed: 0,pmid,title,abstract,language,journal_title,pub_year,authors,predicted_category,full_embeddings,umap_2D_x,umap_2D_y,umap_3D_x,umap_3D_y,umap_3D_z,tsne_2D_x,tsne_2D_y
0,9748443,Effect of slow growth on metabolism of Escheri...,Escherichia coli growing on glucose in minimal...,eng,Journal of bacteriology,1998.0,"Tweeddale H, Notley-McRobb L, Ferenci T",Microbiology,"[[0.04921199, 0.1013429, 0.009529841, -0.08067...",7.770308,7.748135,8.087116,7.735348,6.041772,-18.861538,25.245789
1,10675895,On the optimization of classes for the assignm...,"At present, the assignment of function to nove...",eng,Trends in biotechnology,2000.0,"Kell DB, King RD",unlabeled,"[[0.074717656, 0.12005615, 0.023376802, 0.0167...",3.69291,6.724228,4.727096,6.715599,7.221296,45.593254,63.230408
2,10731098,Assessing the effect of reactive oxygen specie...,A two-dimensional thin-layer chromatographic a...,eng,Redox report : communications in free radical ...,1999.0,"Tweeddale H, Notley-McRobb L, Ferenci T",unlabeled,"[[-0.009071778, 0.013007838, -0.0069063944, -0...",8.485703,8.156181,8.033753,7.959665,5.779803,-18.892046,25.188398
3,10797602,Current awareness on comparative and functiona...,In order to keep subscribers up-to-date with t...,eng,"Yeast (Chichester, England)",2000.0,,Microbiology,"[[-0.07259704, 0.09568493, -0.023760073, -0.06...",3.549895,5.858758,4.250017,5.822019,7.294538,56.503429,61.75153
4,10894722,Global adaptations resulting from high populat...,The scope of population density effects was in...,eng,Journal of bacteriology,2000.0,"Liu X, Ng C, Ferenci T",Microbiology,"[[0.05038802, 0.1111884, -0.044020668, -0.1404...",7.744527,7.660304,8.108534,7.66177,6.14503,-18.486111,25.036968


## kNN Accuracy

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

def knn_accuracy(embedding_x, embedding_y, labels, test_size=0.01, random_state=42, n_neighbors=10):
    """
    Fits a k-NN classifier on a 2D embedding and reports its held-out accuracy.

    The two coordinate vectors are paired into (x, y) points, the points and
    labels are split into a training and a test portion, a brute-force k-NN
    classifier is fitted on the training portion and scored on the test portion.
    The accuracy is printed and returned.

    Parameters:
        embedding_x (array-like): The x-coordinates of the 2D embedding.
        embedding_y (array-like): The y-coordinates of the 2D embedding.
        labels (array-like): The target label of each (x, y) point.
        test_size (float): Share of the data held out for testing. Default is 0.01.
        random_state (int): Seed forwarded to the train/test split. Default is 42.
        n_neighbors (int): Number of neighbors used by the classifier. Default is 10.

    Returns:
        float: The accuracy of the classifier on the test set.
    """
    # One row per sample: [x, y].
    points = np.column_stack((embedding_x, embedding_y))

    train_points, test_points, train_labels, test_labels = train_test_split(
        points, labels, test_size=test_size, random_state=random_state
    )

    # Exhaustive neighbor search, parallelised over all available cores.
    classifier = KNeighborsClassifier(
        n_neighbors=n_neighbors, algorithm='brute', n_jobs=-1
    ).fit(train_points, train_labels)

    accuracy = classifier.score(test_points, test_labels)
    print(f'Accuracy on 2D embeddings: {accuracy:.2f}')
    return accuracy

### tSNE _vs._ UMAP

In [None]:
# kNN accuracy of the 2D t-SNE layout and of the 2D UMAP layout, both scored
# against the same 'predicted_category' labels.
# NOTE(review): 'predicted_category' contains 'unlabeled' rows (see the
# df.head() output); they are scored here like any other class -- confirm
# that this is intended.
category_labels = df['predicted_category']
tse_knn_acc = knn_accuracy(df['tsne_2D_x'], df['tsne_2D_y'], category_labels)
umap_knn_acc = knn_accuracy(df['umap_2D_x'], df['umap_2D_y'], category_labels)

Accuracy on 2D embeddings: 0.56
Accuracy on 2D embeddings: 0.54


In [None]:
# Side-by-side summary of the two accuracies, one per line.
print(
    f'tSNE kNN Accuracy: {tse_knn_acc:.2f}',
    f'UMAP kNN Accuracy: {umap_knn_acc:.2f}',
    sep='\n',
)

tSNE kNN Accuracy: 0.56
UMAP kNN Accuracy: 0.54


## kNN Recall

In [None]:
import numpy as np
from sklearn.neighbors import NearestNeighbors

def calculate_knn_recall(X_original, X_reduced, k=10, metric='euclidean', include_self=False):
    """
    Calculates the k-Nearest Neighbors (kNN) recall between original high-dimensional
    embeddings and reduced (e.g., 2D) embeddings.

    For every sample the k nearest neighbors are searched independently in both
    spaces; the recall is the number of neighbors shared between the two
    neighbor lists, summed over all samples and divided by n_samples * k.

    Parameters:
    -----------
    X_original : array-like
        Original high-dimensional embeddings. Accepted layouts:
        - a 2D array of shape (n_samples, n_features);
        - a 1D sequence/object array whose elements are per-sample vectors
          (they are vertically stacked);
        - an array with more than two dimensions, e.g. (n_samples, 1, n_features)
          as produced by a list of (1, n_features) arrays (trailing axes are
          flattened per sample).

    X_reduced : array-like
        Reduced embeddings (e.g., 2D UMAP or t-SNE embeddings) as a 2D array of shape
        (n_samples, n_components).

    k : int, optional (default=10)
        Number of nearest neighbors to consider.

    metric : str, optional (default='euclidean')
        The distance metric to use when computing nearest neighbors.

    include_self : bool, optional (default=False)
        If False, a sample is never counted as its own neighbor. If True, every
        sample is queried against the index that contains it (the previous
        behaviour of this function), so the sample itself normally fills one of
        the k slots in both spaces and is always counted as shared, which
        inflates the recall by roughly 1/k.

    Returns:
    --------
    knn_recall : float
        The kNN recall, which is the proportion of common neighbors between the original
        and reduced spaces.

    Raises:
    -------
    ValueError
        If X_original is empty, if X_reduced is not 2D, or if the two inputs
        do not describe the same number of samples.

    Notes:
    ------
    NOTE(review): a later cell of this notebook defines a function with the same
    name; after a top-to-bottom run the later definition is the one in effect.
    """
    X_original = np.asarray(X_original)
    if X_original.ndim == 0 or X_original.shape[0] == 0:
        raise ValueError("X_original must contain at least one sample.")

    if X_original.ndim == 1:
        # Sequence of per-sample vectors: stack them into (n_samples, n_features).
        X_original = np.vstack(X_original)
    elif X_original.ndim > 2:
        # e.g. (n_samples, 1, n_features): flatten everything after the sample axis.
        X_original = X_original.reshape(X_original.shape[0], -1)

    X_reduced = np.asarray(X_reduced)
    n_samples = X_original.shape[0]
    if X_reduced.ndim != 2:
        raise ValueError("X_reduced must be a 2D array of shape (n_samples, n_components).")
    if X_reduced.shape[0] != n_samples:
        raise ValueError(
            f"X_original has {n_samples} samples but X_reduced has {X_reduced.shape[0]}."
        )

    def _neighbor_indices(points):
        # Indices of the k nearest neighbors of every row of `points`.
        index = NearestNeighbors(n_neighbors=k, metric=metric).fit(points)
        if include_self:
            return index.kneighbors(points, return_distance=False)
        # Without a query argument the indexed points are queried and a point
        # is not considered its own neighbor.
        return index.kneighbors(return_distance=False)

    original_indices = _neighbor_indices(X_original)
    reduced_indices = _neighbor_indices(X_reduced)

    # Total number of neighbors that appear in both neighbor lists.
    common_neighbors_count = sum(
        len(set(original_row.tolist()).intersection(reduced_row.tolist()))
        for original_row, reduced_row in zip(original_indices, reduced_indices)
    )

    # Compute the kNN recall as the proportion of common neighbors
    knn_recall = common_neighbors_count / (n_samples * k)

    print(f"kNN Recall: {knn_recall:.4f}")
    return knn_recall

In [None]:
import numpy as np
from sklearn.neighbors import NearestNeighbors

def calculate_knn_recall(X_original, X_reduced, k=10, metric='euclidean', include_self=False):
    """
    Calculates the k-Nearest Neighbors (kNN) recall between original high-dimensional
    embeddings and reduced (e.g., 2D) embeddings.

    The k nearest neighbors of every sample are computed separately in the
    original and in the reduced space. The recall is the total number of
    neighbors found in both lists divided by n_samples * k.

    Parameters:
    -----------
    X_original : array-like
        Original high-dimensional embeddings. This can be a 2D array of shape
        (n_samples, n_features), a 1D sequence/object array of per-sample vectors
        (vertically stacked), or an array with more than two dimensions such as
        (n_samples, 1, n_features), whose trailing axes are flattened per sample.

    X_reduced : array-like
        Reduced embeddings (e.g., 2D UMAP or t-SNE embeddings) as a 2D array of shape
        (n_samples, n_components).

    k : int, optional (default=10)
        Number of nearest neighbors to consider.

    metric : str, optional (default='euclidean')
        The distance metric to use when computing nearest neighbors.

    include_self : bool, optional (default=False)
        If False, a sample is never counted as its own neighbor. If True, the
        fitted points are passed back in as queries (the previous behaviour of
        this function); each sample then normally appears in its own neighbor
        list in both spaces and is always counted as shared, which inflates the
        recall by roughly 1/k.

    Returns:
    --------
    knn_recall : float
        The kNN recall, which is the proportion of common neighbors between the original
        and reduced spaces.

    Raises:
    -------
    ValueError
        If X_original is empty, if X_reduced is not 2D, or if the two inputs
        do not describe the same number of samples.

    Notes:
    ------
    NOTE(review): this notebook also defines a function with the same name in
    the preceding cell; this definition replaces it when the cells run in order.
    """
    X_original = np.asarray(X_original)
    if X_original.ndim == 0 or X_original.shape[0] == 0:
        raise ValueError("X_original must contain at least one sample.")

    if X_original.ndim > 2:
        # Reshape e.g. (n_samples, 1, n_features) to (n_samples, n_features).
        X_original = X_original.reshape(X_original.shape[0], -1)
    elif X_original.ndim == 1:
        # Each element is itself a vector: stack them into a 2D array.
        X_original = np.vstack(X_original)
    # A 2D array is already in the required layout and is used as is.

    X_reduced = np.asarray(X_reduced)
    n_samples = X_original.shape[0]
    if X_reduced.ndim != 2:
        raise ValueError("X_reduced must be a 2D array of shape (n_samples, n_components).")
    if X_reduced.shape[0] != n_samples:
        raise ValueError(
            f"X_original has {n_samples} samples but X_reduced has {X_reduced.shape[0]}."
        )

    def _neighbor_indices(points):
        # Indices of the k nearest neighbors of every row of `points`.
        index = NearestNeighbors(n_neighbors=k, metric=metric).fit(points)
        if include_self:
            return index.kneighbors(points, return_distance=False)
        # Omitting the query makes scikit-learn query the indexed points
        # themselves while excluding each point from its own neighbor list.
        return index.kneighbors(return_distance=False)

    original_indices = _neighbor_indices(X_original)
    reduced_indices = _neighbor_indices(X_reduced)

    # Count, over all samples, the neighbors present in both spaces.
    common_neighbors_count = 0
    for original_row, reduced_row in zip(original_indices, reduced_indices):
        shared = set(original_row.tolist()).intersection(reduced_row.tolist())
        common_neighbors_count += len(shared)

    # Compute the kNN recall as the proportion of common neighbors
    knn_recall = common_neighbors_count / (n_samples * k)

    print(f"kNN Recall: {knn_recall:.4f}")
    return knn_recall

### tSNE _vs._ UMAP

In [None]:
# High-dimensional embeddings, one entry per row of the table.
X_original = df['full_embeddings'].tolist()

# 2D coordinates of the two reductions being compared.
X_reduced_tsne = df[['tsne_2D_x', 'tsne_2D_y']].to_numpy()
X_reduced_umap = df[['umap_2D_x', 'umap_2D_y']].to_numpy()

# kNN recall of each reduction against the original space, with k = 10.
tsne_recall = calculate_knn_recall(X_original, X_reduced_tsne, k=10)
umap_recall = calculate_knn_recall(X_original, X_reduced_umap, k=10)

kNN Recall: 0.3303
kNN Recall: 0.1389


In [None]:
# Side-by-side summary of the two recall values, one per line.
print(
    f'tSNE kNN Recall: {tsne_recall:.4f}',
    f'UMAP kNN Recall: {umap_recall:.4f}',
    sep='\n',
)

tSNE kNN Recall: 0.3303
UMAP kNN Recall: 0.1389
