In [1]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
import plotly.express as px

In [2]:
# Load the per-paper metadata table and the precomputed embedding matrix.
# (Presumably RoBERTa embeddings, judging by the file name — confirm upstream.)
df = pd.read_csv("papers_with_keywords_k10.csv")
X  = np.load("paper_embeddings_roberta.npy")

# Later cells project X to 2-D and write the coordinates back into df by
# position, so X must be a 2-D array with exactly one row per df row.
# Check that here so a stale or mismatched file fails immediately with a
# clear message instead of after an expensive fit.
if X.ndim != 2 or X.shape[0] != len(df):
    raise ValueError(
        f"Embeddings of shape {X.shape} do not line up with the "
        f"{len(df)} rows loaded from the CSV"
    )

# Displayed as the cell output: row count of the table and embedding shape.
len(df), X.shape


(1992, (1992, 768))

In [3]:
# Lookup from the integer ids in df["cluster_id"] to a human-readable
# "factory" label. Several cluster ids share one label, which merges those
# clusters into a single group in everything derived from this column.
CLUSTER_TO_FACTORY = {
    0: "Materials & Solid Mechanics",
    3: "Thermal-Fluid & Transport Phenomena",
    5: "Advanced Manufacturing & Devices",

    1: "Robotics & Intelligent Systems",
    2: "Robotics & Intelligent Systems",
    9: "Robotics & Intelligent Systems",

    4: "Biomedical & Biomechanical Engineering",
    8: "Biomedical & Biomechanical Engineering",

    6: "Systems, Education & Interdisciplinary",
    7: "Systems, Education & Interdisciplinary",
}

df["refined_factory"] = df["cluster_id"].map(CLUSTER_TO_FACTORY)

# Series.map leaves NaN for any id that is not a key of the dict, and
# value_counts() below drops NaN by default, so an unmapped cluster would
# otherwise disappear from the summary without any warning.
_unmapped_cluster_ids = df.loc[df["refined_factory"].isna(), "cluster_id"]
if not _unmapped_cluster_ids.empty:
    raise ValueError(
        f"{len(_unmapped_cluster_ids)} row(s) have a cluster_id with no entry in "
        f"CLUSTER_TO_FACTORY; unmapped non-null ids: "
        f"{sorted(_unmapped_cluster_ids.dropna().unique().tolist())}"
    )

# Displayed as the cell output: number of papers per factory label.
df["refined_factory"].value_counts()


refined_factory
Robotics & Intelligent Systems            603
Thermal-Fluid & Transport Phenomena       365
Materials & Solid Mechanics               329
Biomedical & Biomechanical Engineering    302
Systems, Education & Interdisciplinary    227
Advanced Manufacturing & Devices          166
Name: count, dtype: int64

In [5]:
from sklearn.decomposition import PCA
import plotly.express as px

# Reduce the embedding matrix to its first two principal components.
# random_state is fixed so repeated runs give the same projection.
pca = PCA(n_components=2, random_state=42)
Z = pca.fit_transform(X)

# Z has one column per component; transposing lets each component be
# unpacked straight into its own plotting column on df.
df["pca1"], df["pca2"] = Z.T

# One point per row of df, colored by factory label, with the paper title
# and professor name shown on hover.
fig = px.scatter(
    data_frame=df,
    x="pca1",
    y="pca2",
    color="refined_factory",
    title="PCA of Paper Embeddings (Colored by Refined Factory)",
    hover_data=["Paper Title", "Professor Name (Original)"],
)

# Save the interactive figure to an HTML file, then render it inline.
fig.write_html("pca_interactive_refined_factory.html")
fig.show()


In [8]:
from sklearn.manifold import TSNE

# Embed the same matrix X in two dimensions with t-SNE.
# random_state is fixed so repeated runs give the same layout.
tsne = TSNE(n_components=2, random_state=42, perplexity=30)
Z_tsne = tsne.fit_transform(X)

# Z_tsne has one column per output dimension; transposing lets each one be
# unpacked straight into its own plotting column on df.
df["tsne1"], df["tsne2"] = Z_tsne.T

# One point per row of df, colored by factory label, with the paper title
# and professor name shown on hover.
fig_tsne = px.scatter(
    data_frame=df,
    x="tsne1",
    y="tsne2",
    color="refined_factory",
    title="t-SNE of Paper Embeddings (Colored by Refined Factory)",
    hover_data=["Paper Title", "Professor Name (Original)"],
)

# Save the interactive figure to an HTML file, then render it inline.
fig_tsne.write_html("tsne_interactive_refined_factory.html")
fig_tsne.show()
