In [25]:
from glob import glob
from os.path import exists, join, basename
from tqdm import tqdm
from json import load, dump
from matplotlib import pyplot as plt
from collections import Counter

from umap import UMAP

import pandas as pd
import numpy as np

import importlib.util
from pathlib import Path

from dotenv import load_dotenv
from openai import OpenAI

import time

# Load the local wizmap module directly from its source file, located under
# notebook_widget/wizmap two directories above the current working directory.
path = Path.cwd().parent.parent.joinpath("notebook_widget", "wizmap", "wizmap.py")
spec = importlib.util.spec_from_file_location("wizmap", path)
wizmap = importlib.util.module_from_spec(spec)
spec.loader.exec_module(wizmap)

# Load variables from the .env file two levels up before creating the client
# (presumably this is where the OpenAI credentials come from).
load_dotenv("../../.env")
client = OpenAI()

# Load Data

In [9]:
# Load data
# The CSV files are downloaded from
# https://drive.google.com/drive/folders/1grx1dcYGW--wzrxnrw0d6pwW3fbigWgg
# (scaffold.csv is PubChem_2M_10motifs.csv under a new name).
solubility_table = pd.read_csv("solubility.csv")
toxicity_table = pd.read_csv("toxicity.csv")
df = pd.read_csv("scaffold.csv")

# Attach each property to the scaffold table through a Structure -> value
# dictionary; only the Structure column and the property column are used.
for prop, table in (("Solubility", solubility_table), ("Toxicity", toxicity_table)):
    lookup = dict(zip(table["Structure"].to_numpy(), table[prop].to_numpy()))
    df[prop] = df["Structure"].map(lookup)

# Count missing values in the two added columns (0 means every structure in
# the scaffold table found a match).
print(df[["Solubility", "Toxicity"]].isna().sum())

# Convert to a plain array and drop the pandas objects and loop temporaries.
arr = df.to_numpy()
del df, solubility_table, toxicity_table, prop, table, lookup
print(arr.shape)

# Columns 0-31 feed the dimensionality reduction; columns 33 onward hold the
# structure, scaffold, solubility and toxicity values. Column 32 is not used.
# NOTE: both are basic slices (views) of arr, so the underlying buffer stays
# alive after `del arr`.
emb = arr[:, :32]
desc = arr[:, 33:]
print(desc.shape)
del arr

# Build one description string per row, chunk by chunk:
# "<structure>; Scaffold: <..>; Solubility: <..>; Toxicity: <..>"
n = desc.shape[0]
chunk_size = 200000
labels = ("; Scaffold: ", "; Solubility: ", "; Toxicity: ")
pieces = []

for i in range(0, n, chunk_size):
    block = desc[i:i + chunk_size]

    line = block[:, 0].astype(str)
    for col, label in enumerate(labels, start=1):
        line = line + label + block[:, col].astype(str)
    pieces.append(line)

texts_full = np.concatenate(pieces)
del pieces, block, line, col, label, labels

Solubility    0
Toxicity      0
dtype: int64
(2000000, 37)
(2000000, 4)


# Dimensionality Reduction

In [10]:
# Project the embedding columns down to the 2-D coordinates used for the map,
# using UMAP with cosine distance.
# NOTE(review): no random_state is passed, so the layout is presumably not
# reproducible from run to run — confirm whether a fixed seed is wanted
# (setting one may slow the fit down).
reducer = UMAP(metric="cosine")
embeddings_2d = reducer.fit_transform(emb) # takes roughly 9-10 minutes (about 17 on low power mode)

In [11]:
# emb is no longer needed once the 2-D projection exists; drop the reference.
del emb

# Wizmap

In [12]:
# Split the 2-D projection into plain Python lists of floats:
# column 0 -> xs, column 1 -> ys.
xs, ys = (embeddings_2d[:, axis].astype(float).tolist() for axis in (0, 1))

In [13]:
# The coordinates have been copied into xs / ys, so the projection array can be dropped.
del embeddings_2d

In [26]:
# Prompt handed to wizmap for the topic-summary requests. It asks for a JSON
# object with a "keywords" array and a short "summary" string.
instructions = """You are a computational chemist analyzing chemical structures given as SMILES strings.

Analyze these structures to identify:
- Structural similarity and shared substructures
- Common functional groups present in the SMILES representations
- Relevant chemical properties (such as solubility and toxicity)
- The chemical rationale for grouping these functional groups together, based on their structural, electronic, and physicochemical characteristics

Instead of explaining each structure individually, focus on the patterns that are highly common across the set. 
Where relevant, include concrete examples of functional groups or substructures inferred from the SMILES strings to illustrate these patterns.

Summarize the common patterns in under 50 words, then list 2-5 key descriptors that best characterize the group.

Provide your response strictly in the following JSON format:
{
  "keywords": string[], // array of key descriptors that best characterize the group
  "summary": string // 50 words or fewer summary of the common structural and chemical patterns
}
"""

# Only the first description column (the structure string) is summarised.
structure = desc[:, 0]
model_params = dict(model="gpt-4o-mini", temperature=0.3)

# Prepare the summary batches and record them in saves.pkl. With
# send_when_done=False nothing is submitted here; sending is handled by the
# resend cell that follows.
# (This call supersedes the earlier direct call to wizmap.generate_grid_dict.)
saveDat = wizmap.init_topic_summary_batch(
    xs,
    ys,
    structure,
    instructions,
    client,
    "saves.pkl",
    max_zoom_scale=10,
    batch_name="./batches/WM_t-sum",
    send_when_done=False,
    max_requests_per_batch=125,
    openai_model_params=model_params,
)
del structure

Start generating multi-level summary batches...


2000000it [00:15, 130272.10it/s]
Level 1/4: 100%|██████████| 13176/13176 [00:02<00:00, 5819.78it/s]
Level 2/4: 100%|██████████| 3730/3730 [00:01<00:00, 2411.56it/s]
Level 3/4: 100%|██████████| 1137/1137 [00:01<00:00, 937.89it/s] 
Level 4/4: 100%|██████████| 389/389 [00:00<00:00, 392.02it/s]


In [None]:
# This cell can be skipped if init_topic_summary_batch was called with
# send_when_done=True (per the original note, everything is then sent at once).
# Otherwise: reopen the tracker stored in saves.pkl and keep resending batches.
# NOTE(review): sleep_time=120 is presumably the wait in seconds between
# cycles and batches_per_send the number of batches submitted per cycle —
# confirm against wizmap.BatchFileTracker.
saveDat = wizmap.BatchFileTracker("saves.pkl")
saveDat.resend_until_done(client, sleep_time=120, batches_per_send = 3)

{'None': 140}


Sending batches: 100%|██████████| 10/10 [00:12<00:00,  1.30s/it]


Cycle 1 done
{'failed': 8, 'in_progress': 2, 'None': 130}


Sending batches: 100%|██████████| 10/10 [00:09<00:00,  1.03it/s]


Cycle 2 done
{'failed': 10, 'completed': 2, 'None': 128}


Sending batches: 100%|██████████| 10/10 [00:14<00:00,  1.41s/it]


In [24]:
# Reopen the tracker from saves.pkl and check on the batches (verbose output
# lists the failed ones); download the outputs only when the check passes.
saveDat = wizmap.BatchFileTracker("saves.pkl")
all_finished = saveDat.checkCompletion(client, verbose=True)
if all_finished:
    saveDat.download_batch_outputs(client)
    print("All done")

wizmap-topic-summaries_43.jsonl failed
Errors(data=[BatchError(code='token_limit_exceeded', line=None, message='Enqueued token limit reached for gpt-4o-mini in organization org-snxFEOUudfOLSycGh385r02G. Limit: 2,000,000 enqueued tokens. Please try again once some in_progress batches have been completed.', param=None)], object='list')
wizmap-topic-summaries_49.jsonl failed
Errors(data=[BatchError(code='token_limit_exceeded', line=None, message='Enqueued token limit reached for gpt-4o-mini in organization org-snxFEOUudfOLSycGh385r02G. Limit: 2,000,000 enqueued tokens. Please try again once some in_progress batches have been completed.', param=None)], object='list')
wizmap-topic-summaries_50.jsonl failed
Errors(data=[BatchError(code='token_limit_exceeded', line=None, message='Enqueued token limit reached for gpt-4o-mini in organization org-snxFEOUudfOLSycGh385r02G. Limit: 2,000,000 enqueued tokens. Please try again once some in_progress batches have been completed.', param=None)], object=

In [None]:
# Build the grid dictionary from the batch results recorded in saves.pkl.
grid_dict = wizmap.generate_batch_grid_dict(
    client,
    embedding_name="Chemical Structures",
    savefile="saves.pkl",
    retrieve_batches=False,
)

# Reload the tracker here, as the other batch cells do, so this cell does not
# depend on a `saveDat` left over from an earlier cell (for example when
# resuming in a fresh kernel). The tracker supplies the xs / ys coordinates.
saveDat = wizmap.BatchFileTracker("saves.pkl")

# Create the data list. texts_full comes from the "Load data" cell; rerun that
# cell first if the kernel was restarted since it last ran.
data_list = wizmap.generate_data_list(saveDat.xs, saveDat.ys, texts_full)
del texts_full

In [None]:
# Save the data list and grid dictionary via wizmap, using the current
# directory as the output location.
wizmap.save_json_files(data_list, grid_dict, output_dir="./")