In [1]:
!pip uninstall umap
!pip install umap-learn

[0mCollecting umap-learn
  Downloading umap_learn-0.5.7-py3-none-any.whl.metadata (21 kB)
Collecting pynndescent>=0.5 (from umap-learn)
  Downloading pynndescent-0.5.13-py3-none-any.whl.metadata (6.8 kB)
Downloading umap_learn-0.5.7-py3-none-any.whl (88 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m88.8/88.8 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pynndescent-0.5.13-py3-none-any.whl (56 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.9/56.9 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pynndescent, umap-learn
Successfully installed pynndescent-0.5.13 umap-learn-0.5.7


In [3]:
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
import umap.umap_ as umap
import plotly.express as px
from tqdm.auto import tqdm

In [6]:
# Hugging Face checkpoint shared by the tokenizer and the encoder, kept in one
# place so the two can never drift apart.
MODEL_NAME = 'michiyasunaga/BioLinkBERT-base'

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

def get_embeddings(texts, batch_size=32, max_length=None):
    """Encode texts with the module-level ``model`` and return first-token embeddings.

    Args:
        texts: Sequence of strings to encode.
        batch_size: Number of texts tokenized and passed through the model at once.
        max_length: Maximum number of tokens per text; longer inputs are
            truncated. Defaults to ``model.config.max_position_embeddings``.

    Returns:
        A 2-D numpy array with one row per input text, taken from position 0
        of the model's ``last_hidden_state``. An empty input yields an array
        with zero rows.
    """
    model.eval()
    # Run the tokenized batches on whatever device the model currently lives on.
    device = model.device
    if max_length is None:
        max_length = model.config.max_position_embeddings

    # np.vstack raises on an empty list, so handle "nothing to encode" explicitly.
    if len(texts) == 0:
        return np.empty((0, model.config.hidden_size), dtype=np.float32)

    embeddings = []
    for start in tqdm(range(0, len(texts), batch_size)):
        batch = texts[start:start + batch_size]
        # Without truncation, a text longer than the encoder's position limit
        # makes the forward pass fail.
        inputs = tokenizer(
            batch,
            return_tensors='pt',
            padding=True,
            truncation=True,
            max_length=max_length,
        ).to(device)
        with torch.no_grad():
            outputs = model(**inputs)
        # Position 0 of every sequence is used as the per-text representation.
        cls_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
        embeddings.append(cls_embeddings)

    return np.vstack(embeddings)

# Read the assay table and pull the free-text description column out as a
# plain Python list, which is what the embedding helper slices into batches.
df = pd.read_parquet('/kaggle/input/bioassay-ner/ner_data_raw.prqt')
texts = df.loc[:, 'description'].to_list()

# One embedding row per description, in the same order as `texts`.
embeddings = get_embeddings(texts)

  0%|          | 0/253 [00:00<?, ?it/s]

In [9]:
# Project the embeddings down to 2-D for plotting.
# UMAP is stochastic: without a fixed seed every run produces a different
# layout, which makes any downstream filtering on absolute x/y coordinates
# unreliable. Fixing `random_state` makes the projection repeatable.
# NOTE(review): umap-learn runs single-threaded when a seed is set — confirm
# the slowdown is acceptable for this dataset size.
umap_reducer = umap.UMAP(n_components=2, random_state=42)
umap_embeddings = umap_reducer.fit_transform(embeddings)

In [12]:
# Build the plotting frame in one chain: 2-D coordinates, the matching
# description for hover text, then keep only points inside the x/y window
# given by the query below.
umap_df = (
    pd.DataFrame(umap_embeddings, columns=['x', 'y'])
    .assign(description=texts)
    .query('15 > x > 0 and y > 0')
)

# Interactive scatter of the projected descriptions; hovering a point shows
# the original description text.
fig = px.scatter(
    umap_df,
    x='x',
    y='y',
    hover_data=['description'],
    title='Assay descriptions space visualization',
)

# Small, semi-transparent markers.
fig.update_traces(marker={'size': 5, 'opacity': 0.3})

# Both axes get the same settings: no axis title and no tick labels.
axis_style = {'title': None, 'showticklabels': False}
fig.update_layout(
    width=700,
    height=700,
    xaxis=dict(axis_style),
    yaxis=dict(axis_style),
)
fig.show()

In [14]:
# Export the interactive figure to an HTML file in the current working
# directory so it can be opened outside the notebook.
fig.write_html("umap_visualization.html")