# UMAP

In [7]:
from pymongo import MongoClient
from dotenv import load_dotenv
import umap
import numpy as np
import os
import json

  from .autonotebook import tqdm as notebook_tqdm


In [12]:
def get_umap_data(embedding_type):
    """
    Fetch protein embeddings from MongoDB database.

    Loads environment variables via ``load_dotenv()``, connects to the server
    named by ``MONGODB_URI`` and reads the ``protein_embeddings`` collection of
    the ``proteinExplorer`` database. Documents that do not contain the
    requested field are skipped.

    :param embedding_type: the name of the field containing the embeddings.
    :return: a numpy array built from the collected embeddings; its shape is
        (n_proteins, embedding_size) when every stored embedding has the same
        length.
    """
    load_dotenv()
    uri = os.getenv('MONGODB_URI')

    client = MongoClient(uri)
    try:
        collection = client['proteinExplorer']['protein_embeddings']
        # Filter and project on the server so only the requested field is
        # transferred instead of every full document in the collection.
        cursor = collection.find(
            {embedding_type: {'$exists': True}},
            {embedding_type: 1},
        )
        # The membership check is kept as a client-side safety net.
        embeddings = [
            doc[embedding_type] for doc in cursor if embedding_type in doc
        ]
    finally:
        # Release the connection pool even when the query fails; this
        # function is called once per embedding type.
        client.close()

    return np.array(embeddings)

In [5]:
def reduce_embeddings(embeddings, filename, n_components=2, random_state=None):
    """
    Reduce the dimensionality of the embeddings using UMAP and save them as a json file.

    The reduced coordinates are written to ``filename`` as a JSON list of
    lists (one inner list per sample) and also returned.

    :param embeddings: a numpy array of shape (n_samples, n_features).
    :param filename: path of the JSON file to write; an existing file is overwritten.
    :param n_components: number of output dimensions passed to UMAP (default 2).
    :param random_state: seed passed to UMAP; leave as None to keep UMAP's
        default behaviour, or set an int for repeatable layouts.
    :return: a numpy array of shape (n_samples, n_components).
    :raises ValueError: if ``embeddings`` contains no samples.
    """
    # Fail early with a clear message instead of an obscure error from UMAP
    # (and before any output file is created).
    if len(embeddings) == 0:
        raise ValueError('embeddings is empty; nothing to reduce')

    reducer = umap.UMAP(n_components=n_components, random_state=random_state)
    embeddings_reduced = reducer.fit_transform(embeddings)

    # numpy arrays are not JSON serialisable, so convert to nested lists and
    # stream them straight into the file.
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(embeddings_reduced.tolist(), f)

    return embeddings_reduced

## Get Embeddings

In [16]:
# Load one embedding matrix per model from MongoDB (one database field each).
gist_embeddings = get_umap_data(embedding_type='func_embedding')
seq_embeddings = get_umap_data(embedding_type='seq_embedding')
esm_embeddings = get_umap_data(embedding_type='esm2_embedding')

## Reduce Embeddings and Save as JSON

In [13]:
# Reduce each embedding matrix with UMAP and write the result to its own JSON file.
gist_embeddings_reduced = reduce_embeddings(
    embeddings=gist_embeddings,
    filename='gist_embeddings_reduced.json',
)
seq_embeddings_reduced = reduce_embeddings(
    embeddings=seq_embeddings,
    filename='protbert_embeddings_reduced.json',
)
esm_embeddings_reduced = reduce_embeddings(
    embeddings=esm_embeddings,
    filename='esm2_embeddings_reduced.json',
)