In [None]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from utils.utils import pjoin, get_project_path

In [None]:
# Dataset selection: which target and which descriptor family to load.
target_id = "CHEMBL262"
desc = "rdkit"

# The descriptor table lives at <project>/data/<target>_<descriptor>.csv
csv_name = f"{target_id}_{desc}.csv"
path = pjoin(get_project_path(), "data", csv_name)

In [None]:
# Load the descriptor table (first CSV column becomes the index) and keep
# only rows that have no missing value in any column.
X = pd.read_csv(path, index_col=0).dropna(axis=0, how="any")

In [None]:
# Target variable: negative log10 of the activity value after scaling by 1e-9.
# NOTE(review): the 1e-9 factor implies `standard_value` is in nM, which makes
# this pIC50 = -log10(IC50 [M]) — confirm the units against the data export.
ic50_molar = X["standard_value"] * 1e-9

# log10 yields -inf for 0 and NaN for negative inputs; the warnings are
# silenced here because both cases are handled explicitly just below.
with np.errstate(divide="ignore", invalid="ignore"):
    y10 = -np.log10(ic50_molar)

# dropna() alone only removes NaN, so a zero activity would survive as +inf
# and wreck the colour scale downstream. Map +/-inf to NaN first, then drop.
y10 = y10.replace([np.inf, -np.inf], np.nan).dropna()

In [None]:
# Keep only the descriptor columns: remove the structure string and the
# raw activity value so they are not fed to the embedding.
non_feature_cols = ["smiles", "standard_value"]
X = X.drop(columns=non_feature_cols)

In [None]:
# Restrict the descriptor rows to the labels that still have a target value,
# in the same order as y10, so rows and targets line up one-to-one.
kept_rows = y10.index
X = X.loc[kept_rows, :]

In [None]:
# Project the descriptor matrix to 2-D with t-SNE.
#
# The iteration count is deliberately left at the library default (1000):
# the keyword was renamed from `n_iter` to `max_iter` in scikit-learn 1.5 and
# the old name was later removed, so passing either one explicitly breaks on
# some installed version. The default is 1000 under both names.
#
# scikit-learn requires perplexity < n_samples; clamp it so the cell still
# runs on a small dataset (30 is used whenever there are more than 30 rows).
n_samples = len(X)
perplexity = min(30, max(1, n_samples - 1))
tsne = TSNE(n_components=2, random_state=42, perplexity=perplexity)
X_embedded = tsne.fit_transform(X)

# Plot with continuous color scale
# y10 is row-aligned with X (and therefore with X_embedded), so it can be
# used directly as the per-point colour value.
fig, ax = plt.subplots(figsize=(10, 8))
sc = ax.scatter(X_embedded[:, 0], X_embedded[:, 1], c=y10, cmap='viridis', s=60)
fig.colorbar(sc, ax=ax, label='pIC50')
ax.set_title(f't-SNE of Molecular Descriptors ({desc})')
ax.set_xlabel('t-SNE 1')
ax.set_ylabel('t-SNE 2')
fig.tight_layout()
plt.show()