In [None]:
import pandas as pd
import numpy as np

import pandas as pd
import numpy as np
import gzip

# Load the full submissions table from the gzipped pickle.
# NOTE(security): unpickling can execute arbitrary code; only load a file you trust.
with gzip.open('moonshot_submissions.pkl.gz', 'rb') as fh:
    _m = pd.read_pickle(fh)

# Keep only the rows flagged `okay` and the columns used by this notebook.
# Each column is listed exactly once: the previous list repeated 'xcode' and
# 'purchasable', which produced duplicate column labels (so e.g.
# `moonshot['xcode']` returned a DataFrame instead of a Series).
# `.copy()` makes `moonshot` an independent frame so that later cells can add
# columns to it without triggering SettingWithCopyWarning.
moonshot = _m.loc[_m.okay, [
    'CID (canonical)', 'CID_group', 'old_index', 'clean_creator', 'internal', 'SMILES', 'new_smiles',
    'fragments', 'xcode', 'Structure ID', 'site_name', 'pdb_entry',
    'series',
    'postera_SAScore', 'postera_minNumSteps',
    'purchasable', 'SAScore', 'midprefix', 'category', 'quarter (shipment)', 'quarter (submission)',
    'ORDERED', 'MADE', 'ASSAYED', 'in_fragalysis',
    'IC50', 'pIC50',
    'submission_date', 'inferred_submission_date', 'order_date', 'shipment_date',
    'description', 'initial_screen',
    'N_creator_submission', 'N_submission_group', 'resubmitted',
    'Enamine - REAL Space', 'Enamine - Extended REAL Space',
    'Enamine - SCR', 'Enamine - BB', 'Mcule', 'Mcule Ultimate',
    'N_chars', 'N_words', 'N_words_cutoff', 'classified_method', 'flesch',
    'dale_chall',
]].copy()

# Drop the reference to the unfiltered table; only `moonshot` is used from here on.
del _m

In [None]:
# Helper columns for plot hover text:
#   short     - the first 50 characters of the free-text description
#   pIC50_str - pIC50 rendered as a string
moonshot['short'] = moonshot['description'].str[:50]
moonshot['pIC50_str'] = moonshot['pIC50'].astype(str)

In [None]:
# Subset of rows whose `postera_minNumSteps` is at most 4
# (rows with a missing value compare as False and are excluded).
_few_steps = moonshot['postera_minNumSteps'].le(4)
quad = moonshot.loc[_few_steps]

In [None]:
# Subset of rows that have a pIC50 value (i.e. pIC50 is not NaN).
made = moonshot.loc[moonshot['pIC50'].notna()]

In [None]:
import umap
from functools import partial
from rdkit import Chem
from rdkit.Chem import AllChem
import pandera.typing as pdt
import numpy.typing as npt

# Morgan fingerprints (radius 4, 2**13 bits) for every SMILES, each converted
# to a numpy array so that they can be stacked into one matrix for UMAP.
# NOTE(review): Chem.MolFromSmiles returns None for an unparsable SMILES, which
# would make the fingerprint step raise - assumes every SMILES here is valid.
fp: pdt.Series[npt.ArrayLike] = moonshot.SMILES\
             .apply(Chem.MolFromSmiles)\
             .apply(partial(AllChem.GetMorganFingerprintAsBitVect, radius=4, nBits=2**13))\
             .apply(np.array)

# random setting from web...
# Jaccard is Tanimoto, which makes sense (metric='euclidean' for a one-hot is bad idea)
# `random_state` is fixed so that Restart & Run All reproduces the same
# embedding (and therefore the same figures); UMAP is stochastic otherwise.
# A fixed seed makes UMAP run without its parallel code paths, so it is slower.
model = umap.UMAP(metric = "jaccard",
                  n_neighbors = 25,
                  n_components = 2,
                  low_memory = False,
                  min_dist = 0.001,
                  random_state = 42)
_u: npt.ArrayLike = model.fit_transform(np.stack(fp.values))
moonshot["UMAP_0"]: pdt.Series[float] = _u[:,0]
moonshot["UMAP_1"]: pdt.Series[float] = _u[:,1]

# `made` was sliced from `moonshot` before the UMAP columns existed, so that
# earlier copy does not contain them. Re-derive it here so the cells below that
# plot `made` against UMAP_0/UMAP_1 work on a fresh kernel.
made = moonshot.loc[~moonshot.pIC50.isna()]

In [None]:
import plotly.express as px

# Interactive scatter of the `made` subset in UMAP space, coloured by category.
# The figure is the cell's last expression, so it is displayed directly.
px.scatter(
    made,
    x="UMAP_0",
    y="UMAP_1",
    color='category',
    opacity=0.1,
    hover_data=['CID (canonical)', 'pIC50_str',  'clean_creator', 'site_name', 'short'],
)

In [None]:
# Scratch check of the noise source used for the jitter below.
# `random` is only imported in a later cell, so import it here as well;
# otherwise this cell raises NameError on a fresh kernel.
import random

random.gauss(0, 1)

In [None]:
import random
import plotly.express as px

# Add Gaussian jitter (mean 0, sd 1) to each UMAP coordinate for the interactive plot.
# The noise is drawn in one vectorised call per axis from a seeded generator,
# so re-running the notebook gives the same figure.
_wobble_rng = np.random.default_rng(0)
moonshot["UMAP_0_wobble"] = moonshot.UMAP_0 + _wobble_rng.normal(0, 1, size=len(moonshot))
moonshot["UMAP_1_wobble"] = moonshot.UMAP_1 + _wobble_rng.normal(0, 1, size=len(moonshot))

# Plotly renders titles as HTML-like text: a line break is `<br>`, not `\n`.
# The bit count in the title matches the fingerprint size used for the
# embedding (nBits=2**13); keep the two in sync if that changes.
fig = px.scatter(moonshot,
                   x="UMAP_0_wobble",
                   y="UMAP_1_wobble",
                   title=f'Diversity of all compounds<br>clustered by Tanimoto similarity (4-jump radius, {2**13} bits)',
                   hover_data=['CID (canonical)', 'pIC50_str',  'clean_creator', 'site_name', 'short'],
                   color='category',
                   category_orders=dict(category=['Community', 'Core']),
                   opacity=0.1)
fig

In [None]:
# `os` is needed to read the API key from the environment; it was not imported
# anywhere above, so this cell raised NameError on a fresh kernel.
import os

import chart_studio
# The API key is read from the PLOTLY_API_KEY environment variable (KeyError if unset)
# rather than being hardcoded in the notebook.
chart_studio.tools.set_credentials_file(username='matteoferla', api_key=os.environ['PLOTLY_API_KEY'])
import chart_studio.plotly as studio_pl

# Upload the figure built in the previous cell to Chart Studio.
studio_pl.plot(fig, filename = 'umap-all-moonshot', auto_open=True) # 524.288 KB limit!

In [None]:
import numpy as np
import seaborn as sns
import os  # local import: only used to make sure the output directory exists

sns.set_theme(style="ticks")
# Hex-binned joint plot of the UMAP embedding for the `made` subset.
p = sns.jointplot(data=made, x='UMAP_0', y='UMAP_1', kind="hex",
              color="#4CB391")
# The title reports the fingerprint size actually used for the embedding
# (nBits=2**13 in the fingerprint cell); it previously said 2^11.
p.fig.suptitle('Diversity of synthesised compounds\nclustered by Tanimoto similarity (4-jump radius, 2^13 bits)')
p.fig.tight_layout()
# Leave room above the axes for the two-line suptitle.
p.fig.subplots_adjust(top=0.90)
# savefig does not create missing directories, so create it first.
os.makedirs("images", exist_ok=True)
p.savefig("images/dim-red_made.png")

In [None]:
import numpy as np
import seaborn as sns
import os  # local import: only used to make sure the output directory exists

sns.set_theme(style="ticks")
# Hex-binned joint plot of the UMAP embedding for every row of `moonshot`.
p = sns.jointplot(data=moonshot,
                  x='UMAP_0', y='UMAP_1',
                  kind="hex",  # hue= and hex is not supported.
                  color="#4CB391"
                 )
# This figure plots all compounds (data=moonshot, saved as dim-red_all.png),
# so the title says "all" rather than "synthesised". The bit count matches the
# fingerprint size used for the embedding (nBits=2**13 in the fingerprint cell).
p.fig.suptitle('Diversity of all compounds\nclustered by Tanimoto similarity (4-jump radius, 2^13 bits)')
p.fig.tight_layout()
# Leave room above the axes for the two-line suptitle.
p.fig.subplots_adjust(top=0.90)
# savefig does not create missing directories, so create it first.
os.makedirs("images", exist_ok=True)
p.savefig("images/dim-red_all.png")

In [None]:
import numpy as np
import seaborn as sns
import os  # local import: only used to make sure the output directory exists

sns.set_theme(style="ticks")
# KDE joint plot of the UMAP embedding for every row of `moonshot`,
# with one density per value of the `category` column.
p = sns.jointplot(data=moonshot,
                  x='UMAP_0', y='UMAP_1',
                  kind="kde",
                  hue="category"
                 )
# The bit count matches the fingerprint size used for the embedding
# (nBits=2**13 in the fingerprint cell); it previously said 2^12.
p.fig.suptitle('Diversity of all compounds\nclustered by Tanimoto similarity (4-jump radius, 2^13 bits)')
p.fig.tight_layout()
# Leave room above the axes for the two-line suptitle.
p.fig.subplots_adjust(top=0.90)
# savefig does not create missing directories, so create it first.
os.makedirs("images", exist_ok=True)
p.savefig("images/dim-red_all-alt.png")

In [None]:
import numpy as np
import seaborn as sns
import os  # local import: only used to make sure the output directory exists

sns.set_theme(style="ticks")
# KDE joint plot of the UMAP embedding for every row of `moonshot`,
# with one density per value of the `MADE` column.
p = sns.jointplot(data=moonshot,
                  x='UMAP_0', y='UMAP_1',
                  kind="kde",
                  hue="MADE"
                 )
# The bit count matches the fingerprint size used for the embedding
# (nBits=2**13 in the fingerprint cell); it previously said 2^12.
# NOTE(review): the title says "synthesised compounds" but the plot contains all
# rows, split by MADE - confirm the intended wording.
p.fig.suptitle('Diversity of synthesised compounds\nclustered by Tanimoto similarity (4-jump radius, 2^13 bits)')
p.fig.tight_layout()
# Leave room above the axes for the two-line suptitle.
p.fig.subplots_adjust(top=0.90)
# savefig does not create missing directories, so create it first.
os.makedirs("images", exist_ok=True)
p.savefig("images/dim-red-alt2.png")