In [1]:
from pathlib import Path
from sklearn.decomposition import PCA
import numpy as np
import pandas as pd
import plotly.express as px

In [10]:
import json
from pathlib import Path
import shutil
import os

In [11]:
def gen_arr(embeddings, seq_id_to_label):
    """
    Collect the average embedding and the label for every entry of an embeddings archive.

    Every name listed in ``embeddings.files`` is treated as a sequence ID. The stored
    entry is unwrapped with ``.item()`` and its ``"avg"`` value is taken as that
    sequence's embedding; the label is looked up under the same ID.
    Args:
        embeddings (numpy.lib.npyio.NpzFile): Archive whose entries unwrap to a mapping with an "avg" key
        seq_id_to_label (dict[str,str]): Map from sequence ID to classification label
    Returns:
        output (np.array): Average embeddings stacked in ``embeddings.files`` order
        labels (list[str]): Labels in the same order as the entries of ``output``
    """
    # Build (embedding, label) pairs in one pass; within each tuple the embedding is
    # read before the label is looked up.
    pairs = [
        (embeddings[seq_id].item()["avg"], seq_id_to_label[seq_id])
        for seq_id in embeddings.files
    ]
    avg_vectors = [vector for vector, _ in pairs]
    seq_labels = [label for _, label in pairs]
    return np.array(avg_vectors), seq_labels

In [12]:
from flask import Flask, current_app, jsonify, request, render_template, redirect, send_file, url_for

In [13]:
def show_visualization():
    """
    Return the rendered ``pca_index.html`` template.
    """
    rendered_page = render_template("pca_index.html")
    return rendered_page

In [21]:
def visualize_data(
    n_components=3,
    targets="enzyme_to_class_tape.json",
    input_data="tape.npz",
    output_html="templates/pca.html",
):
    """
    Prepare and write an interactive plotly PCA visualization of sequence embeddings.

    The embeddings are reduced to ``n_components`` principal components, plotted as a
    2D or 3D scatter coloured by label, and saved as an HTML file.
    Args:
        n_components (int): Number of PCA components (must be 2 or 3)
        targets (str | Path): JSON labels file mapping sequence ID to label
        input_data (str | Path): npz file with sequence embeddings, in the layout gen_arr reads
        output_html (str | Path): Path the HTML plot is written to
    Returns:
        None
    Raises:
        ValueError: If n_components is not 2 or 3
    """
    # Validate before any I/O so a bad value fails fast instead of leaving `fig` unbound.
    if n_components not in (2, 3):
        raise ValueError(
            "n_components must be 2 or 3, got {!r}".format(n_components)
        )

    # load labels file (context manager so the handle is closed promptly)
    with open(targets) as labels_file:
        lookup_d = json.load(labels_file)

    print("generating dataframes")
    # load npz file
    # SECURITY: allow_pickle=True unpickles the stored objects, which can execute
    # arbitrary code. Only load archives from a trusted source.
    with np.load(input_data, allow_pickle=True) as archive:
        embed_arr, embed_labels = gen_arr(archive, lookup_d)

    print("generating PCA")
    pca = PCA(n_components=n_components)
    principal_components = pca.fit_transform(embed_arr)
    # Column names follow the requested dimensionality: pc1, pc2[, pc3].
    component_cols = ["pc{}".format(i) for i in range(1, n_components + 1)]
    principal_df = pd.DataFrame(data=principal_components, columns=component_cols)
    principal_df["target"] = embed_labels

    print("generating plot")
    # Options shared by the 2D and 3D scatter plots.
    plot_kwargs = dict(
        x="pc1",
        y="pc2",
        color="target",
        color_discrete_sequence=px.colors.qualitative.G10,
    )
    if n_components == 3:
        fig = px.scatter_3d(principal_df, z="pc3", **plot_kwargs)
    else:
        fig = px.scatter(principal_df, **plot_kwargs)

    # Create the output directory if needed so write_html does not fail on a fresh checkout.
    output_path = Path(output_html)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    fig.write_html(str(output_path))

In [22]:
# Run the PCA pipeline: prints progress messages and saves the plot to templates/pca.html.
visualize_data()

generating dataframes
generating PCA
generating plot
