<a href="https://colab.research.google.com/github/pikanaeri/plm-model-comparison/blob/main/phrog-embedding-figures/UMAP_Projection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#@title Installing and importing UMAP (https://pypi.org/project/umap-learn/)
pip install umap-learn[plot]
pip install holoviews
pip install -U ipykernel

In [None]:
#@title Importing Dependencies
import umap
import pickle
from ast import literal_eval
import pandas as pd
import numpy as np
import random
import os

from sklearn import metrics
from sklearn.cluster import SpectralClustering

import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import textwrap

from scipy.spatial.distance import squareform

from sklearn.manifold import TSNE
import umap

# Colour used for each label when plotting. Passed to seaborn as the `palette`
# mapping, so every label that is plotted needs an entry here.
phrog_palette = dict([
    ('DNA, RNA and nucleotide metabolism', 'red'),
    ('connector', 'blue'),
    ('head and packaging', 'green'),
    ('integration and excision', 'pink'),
    ('lysis', 'gray'),
    ('moron, auxiliary metabolic gene and host takeover', 'brown'),
    ('other', 'purple'),
    ('tail', 'darkorange'),
    ('transcription regulation', 'cyan'),
])

In [None]:
#@title Reading in PHROGs Information
#@markdown * This cell reads the PHROG index table and builds a dictionary mapping each PHROG number to the label stored in the seventh tab-separated column of its row
#@markdown * Download PHROG_index.tsv here: https://storage.googleapis.com/plm-model-comparison/PHROG_index.tsv

#label for each phrog family (PHROG number -> seventh column of its row)
phrog_dict = {}
#number of phrog families carrying each label
phrog_count = {}
#total number of data rows read
phrog_cnt = 0

# `with` guarantees the file is closed even if a malformed row raises below.
with open("PHROG_index.tsv", "r") as f:
  # The first line is the header row; it is kept as the list of column names.
  # NOTE: the name `labels` is reassigned by the embedding-loading cell.
  labels = f.readline().strip().split("\t")

  for line in f:
    # Skip blank lines (e.g. a trailing empty line at the end of the file),
    # which would otherwise fail with an IndexError when parsed.
    if not line.strip():
      continue
    information = line.strip().split("\t")
    # The first column looks like "phrog_<number>"; keep only the integer part.
    nm = int(information[0].split("phrog_")[1])
    category = information[6]
    phrog_dict[nm] = category
    phrog_count[category] = phrog_count.get(category, 0) + 1
    phrog_cnt += 1

# Summary: one line per label, followed by the total number of rows.
for i in phrog_count:
  print(i, phrog_count[i])
print("total ", phrog_cnt)

In [None]:
#@title Reading in PHROGs Embeddings
#@markdown * This cell loads every pickled embedding (`*.pkl`) found in the `final_average_embeddings` folder
#@markdown * Once the embedding vectors are created and averaged, store them in a `final_average_embeddings` folder and upload it here

# Files are read through an explicit path instead of os.chdir(), so the cell
# can be re-run safely. As a consequence the working directory is no longer
# changed: files written later with relative paths land in the original
# working directory rather than inside the embeddings folder.
embedding_dir = "final_average_embeddings"

#phrog numbers of the loaded embeddings (parallel to `vectors` and `labels`)
p_num = []
#embedding vectors
vectors = []
#functional category labels
labels = []

#number of embedding files found per category (includes "unknown function",
#even though those embeddings are not loaded)
act_phrog_count = {}
#number of embeddings actually loaded
cnt = 0

# os.listdir() returns names in arbitrary order; sorting makes the row order of
# `vectors` (and therefore everything computed from it) the same on every run.
for i in sorted(os.listdir(embedding_dir)):
  if not i.endswith(".pkl"):
    continue
  # Reduce the file name to the bare PHROG number by removing ".pkl",
  # "phrog_" and "_averaged".
  i2 = i.replace(".pkl", "").replace("phrog_", "").replace("_averaged", "")
  num = int(i2)
  category = phrog_dict[num]
  act_phrog_count[category] = act_phrog_count.get(category, 0) + 1
  # Families without a known function are counted above but not embedded.
  if category == "unknown function":
    continue
  # SECURITY: pickle.load can execute arbitrary code; only load embedding
  # files that come from a trusted source.
  with open(os.path.join(embedding_dir, i), "rb") as f2:
    vectors.append(pickle.load(f2))
  p_num.append(num)
  labels.append(category)
  cnt += 1

print("total ", cnt)

for i in act_phrog_count:
  print(i, act_phrog_count[i])

labels = np.array(labels)
vectors = np.array(vectors)

In [None]:
#@title Creating and Saving Figure

# Reduce the embedding vectors to two components with UMAP
# (random initialisation, fixed random_state).
umap_2d = umap.UMAP(n_components=2, init='random', random_state=123)
umapper = umap_2d.fit_transform(vectors)

# One row per embedding: its label and its two UMAP coordinates.
df_u = pd.DataFrame({
    'label': labels,
    'comp1': umapper[:, 0],
    'comp2': umapper[:, 1],
})

plt.rcParams.update({'font.size': 24})
fig = plt.figure(figsize=(24, 12))
# Points are coloured per label through phrog_palette; no legend is drawn.
ax = sns.scatterplot(
    data=df_u,
    x='comp1',
    y='comp2',
    hue=df_u['label'].tolist(),
    palette=phrog_palette,
    legend=False,
    s=40,
)

# Tighten the layout, write the PNG at 300 dpi, then display the figure.
fig.tight_layout()
fig.savefig('family_vector_centroid_umap.png', dpi=300)

plt.show()