In [None]:
import pandas as pd
import numpy as np

# Load the pickled submissions table.
# NOTE(review): unpickling can execute arbitrary code — only load this file
# when it comes from a trusted source.
_m = pd.read_pickle('moonshot_submissions.p')

# Columns carried forward into the analysis table. 'xcode' is listed once only:
# selecting the same label twice produces two identically named columns, after
# which `moonshot['xcode']` would return a DataFrame instead of a Series.
_moonshot_columns = [
    'CID (canonical)', 'CID_group', 'old_index', 'clean_creator', 'SMILES', 'new_smiles',
    'fragments', 'xcode', 'Structure ID', 'site_name', 'pdb_entry',
    'series',
    'ORDERED', 'MADE', 'ASSAYED', 'in_fragalysis',
    'IC50', 'pIC50',
    'submission_date', 'inferred_submission_date', 'order_date', 'shipment_date',
    'description', 'initial_screen',
    'N_creator_submission', 'N_submission_group', 'resubmitted',
    'Enamine - REAL Space', 'Enamine - Extended REAL Space',
    'Enamine - SCR', 'Enamine - BB', 'Mcule', 'Mcule Ultimate',
    'N_chars', 'N_words', 'N_words_cutoff', 'classified_method', 'flesch',
    'dale_chall',
]

# Keep only the rows flagged by the boolean `okay` column. Rows and columns are
# selected in a single `.loc` call and copied explicitly, so that the column
# assignments made on `moonshot` in later cells write to an independent frame
# (and do not raise SettingWithCopyWarning).
moonshot = _m.loc[_m.okay, _moonshot_columns].copy()

#del _m

In [None]:
# Vendor catalogue columns: a compound is flagged purchasable when at least one
# of them holds a real entry.
_catalogue_columns = ['Enamine - REAL Space', 'Enamine - Extended REAL Space',
                      'Enamine - SCR', 'Enamine - BB', 'Mcule', 'Mcule Ultimate']
_catalogues = moonshot[_catalogue_columns]

# String forms that mean "no entry": a literal False, an empty string, or NaN.
_not_listed = ['False', '', 'nan']

# Whole-frame test instead of a row-wise `apply` that re-stringified every row
# three times. The `notna()` guard additionally stops None / NaT / pd.NA (whose
# string forms are not in `_not_listed`) from being counted as a catalogue hit.
moonshot['purchasable'] = (
    _catalogues.notna() & ~_catalogues.astype(str).isin(_not_listed)
).any(axis=1)

In [None]:
from rdkit import Chem
from rdkit.Contrib.SA_Score import sascorer
sascorer.readFragmentScores()


def _sa_score(smiles):
    """Return the sascorer score for a SMILES string, or NaN when it cannot be scored.

    `Chem.MolFromSmiles` returns None for a string it cannot parse; passing that
    None (or a non-string such as a missing value) on to
    `sascorer.calculateScore` would raise instead of yielding a value that the
    `fillna` below could replace.
    """
    if not isinstance(smiles, str):
        return np.nan
    mol = Chem.MolFromSmiles(smiles)
    # An empty molecule has nothing to score either.
    if mol is None or mol.GetNumAtoms() == 0:
        return np.nan
    return sascorer.calculateScore(mol)


# Anything that could not be scored is assigned 10.
# NOTE(review): 10 is presumably the worst value on the SA scale — confirm.
moonshot['SAScore'] = moonshot.SMILES.apply(_sa_score).fillna(10)

In [None]:
# Submitter names that are flagged as internal. Some entries appear to be
# spelling variants of the same person (e.g. 'Ed Griffen' / 'Edward Griffen');
# each variant is listed so that either spelling matches.
cabal = [
    'Aaron Morris',
    'Alessandro Contini',
    'Alpha Lee',
    'Andre Schützer de Godoy',
    'Andrea Rizzi',
    'Anna Carbery',
    'Annette von Delft',
    'Anthony Aimon',
    'Austin Clyde',
    'Daren Fearon',
    'Demetri Moustakas',
    'Ed Griffen',
    'Edward Griffen',
    'Frank von Delft',
    'Garrett M. Morris',
    'Garrett Morris',
    'Halina Mikolajek',
    'Hannah Bruce Macdonald',
    'Hannah Bruce-Macdonald',
    'Jason Cole',
    'Jason Pattis',
    'Jenke Scheen',
    'Jin Pan',
    'John Chodera',
    'Kadi Liis Saar',
    'Lizbé Koekemoer',
    'Mark Calmiano',
    'Mark Williamson',
    'Matt Hurley',
    'Matt Robinson',
    'Matt Wittmann',
    'Matteo Ferla',
    'Matthew Hurley',
    'Matthew Robinson',
    'Matthew Wittmann',
    'Med-Chem team',
    'Melissa Boby',
    'Michael Retchin',
    'Mihaela Smilova',
    'Milan Cvitkovic',
    'Nir London',
    'Rajath Salegame',
    'Richard Foster',
    'Robert Glen',
    'Ruby Pai',
    'Tim Dudgeon',
    'Tyler Gorrie-Stone',
    'Vincent Voelz',
    'Warren Thompson',
    'Will Glass',
    'William Glass',
    'William McCorkindale',
    'Yuanqing Wang',
]

# Case-insensitive membership: both sides are lower-cased before comparing.
_cabal_lowercase = {name.lower() for name in cabal}
_creator_lowercase = moonshot.clean_creator.str.lower()
moonshot['internal'] = _creator_lowercase.isin(_cabal_lowercase)

# From the canonical compound ID, capture the three word characters that
# follow the first "three word characters + hyphen" match.
_mid_prefix_pattern = r'\w{3}-(?P<mid_prefix>\w{3})'
_extracted = moonshot['CID (canonical)'].str.extract(_mid_prefix_pattern)
moonshot['midprefix'] = _extracted['mid_prefix']


In [None]:
# Human-readable label for the boolean `internal` flag.
moonshot['category'] = moonshot.internal.map({True: 'Core', False: 'Community'})

# Three-month bin index of the shipment date, counted from the start of 2020.
# `month // 3` groups the months as Jan-Feb / Mar-May / Jun-Aug / Sep-Nov / Dec,
# and December lands in the same bin as the following January-February, so the
# bins are offset by one month from calendar quarters. The offset is kept on
# purpose: elsewhere in this notebook `quarter (submission) < 3` is used
# alongside a "before 1 June 2020" date cutoff, which only agree with this
# binning.
#
# The computation is vectorised through `.dt`; values that are missing or are
# not dates become NaT via `errors='coerce'` and receive the sentinel -1,
# instead of relying on `str(d.month) != 'nan'`, which raises on None or on a
# float NaN.
_shipped = pd.to_datetime(moonshot.shipment_date, errors='coerce')
moonshot['quarter (shipment)'] = (
    (1 + _shipped.dt.month // 3 + 4 * (_shipped.dt.year - 2020))
    .fillna(-1)
    .astype(int)
)

In [None]:
# Three-month bin index of the inferred submission date, counted from the
# start of 2020. `month // 3` groups the months as Jan-Feb / Mar-May / Jun-Aug /
# Sep-Nov / Dec (December shares a bin with the following January-February),
# i.e. the bins are offset by one month from calendar quarters. That offset is
# intentional here: bin 2 ends on 31 May, matching the "before 1 June 2020"
# cutoff used for the Spring 2020 figures in this notebook.
#
# Missing or non-date values become NaT via `errors='coerce'` and are given the
# sentinel -1; the previous `str(d.month) != 'nan'` check raised on None or on
# a float NaN.
_submitted = pd.to_datetime(moonshot.inferred_submission_date, errors='coerce')
moonshot['quarter (submission)'] = (
    (1 + _submitted.dt.month // 3 + 4 * (_submitted.dt.year - 2020))
    .fillna(-1)
    .astype(int)
)

In [None]:
import plotly.express as px
from datetime import datetime

# Histogram of SAScore, coloured by category, restricted to rows whose
# inferred submission date is earlier than 1 June 2020.
_spring_cutoff = datetime(year=2020, month=6, day=1)
_before_cutoff = moonshot.inferred_submission_date < _spring_cutoff

fig = px.histogram(
    moonshot.loc[_before_cutoff],
    x='SAScore',
    color='category',
    title='Synthetic accessibility of compounds submitted in Spring 2020',
)
# Save a static copy next to the notebook, then render inline.
fig.write_image("SAScore_corevcomm_histo.jpg")
fig.show()

In [None]:
import plotly.express as px
from datetime import datetime

# SAScore distribution per submission quarter, split by category, with an
# embedded box plot in every violin.
fig = px.violin(
    moonshot,
    x='quarter (submission)',
    y='SAScore',
    color='category',
    box=True,
    title='Synthetic accessibility of compounds',
    template="plotly_white",
)
# Show only the x window 1.5-7.5 (quarters 2 to 7); this also leaves out the
# -1 value assigned to rows without a submission date.
fig.update_layout(xaxis={'range': [1.5, 7.5]})
# 'hard' span mode: do not extend the density beyond the sample's min and max.
fig.update_traces(spanmode='hard')
fig.write_image("SAScore_corevcomm_vio.jpg")
fig.show()

In [None]:
def _timestamp_or_nan(value):
    """Return `value` unchanged when it is a `pd.Timestamp`, otherwise NaN."""
    if isinstance(value, pd.Timestamp):
        return value
    return np.nan


# Submission dates with every entry that is not a real Timestamp blanked out.
fixed_date = moonshot.inferred_submission_date.apply(_timestamp_or_nan)

In [None]:
from datetime import datetime

# Count of compounds per category, coloured by the `purchasable` flag, for rows
# whose cleaned submission date (`fixed_date`) is earlier than 1 June 2020.
_spring_cutoff = datetime(year=2020, month=6, day=1)
_spring_rows = moonshot.loc[fixed_date < _spring_cutoff]

fig = px.histogram(
    _spring_rows,
    x='category',
    color='purchasable',
    title='Catalogue availability of compounds submitted in Spring 2020',
)
fig.write_image("catalogue_corevcomm.jpg")
fig.show()

## NB: catalogue information stops being added after a while

In [None]:
from datetime import datetime

# Count of compounds per submission quarter, coloured by the `purchasable`
# flag. This figure covers every row of `moonshot`, so the title describes the
# per-quarter view (it previously repeated the "Spring 2020" title of the
# filtered figure).
fig = px.histogram(
    moonshot,
    x='quarter (submission)',
    color='purchasable',
    title='Catalogue availability of compounds by submission quarter',
)
# Saving is left disabled: this filename is the one the Spring 2020 catalogue
# figure is written to, so enabling it as-is would overwrite that image.
#fig.write_image("catalogue_corevcomm.jpg")
fig.show()

## % purchasable

In [None]:
# Percentage of purchasable compounds per category among rows whose submission
# quarter is below 3. The share is the mean of the boolean `purchasable` flag,
# which equals the former sum / len of two identical pivot tables and avoids
# passing the builtin `sum` as an aggregation function (deprecated in recent
# pandas).
# NOTE(review): rows carrying the -1 "no date" quarter also satisfy `< 3` and
# are therefore included — confirm that this is intended.
_early = moonshot.loc[moonshot['quarter (submission)'] < 3]
(
    _early.groupby('category')['purchasable']
    .mean()
    .mul(100)
    .round(1)
    .rename('% purchasable')
    .to_frame()
    .transpose()
)

In [None]:
from rdkit.Chem import Draw

# The 20 rows with the highest SAScore. `.copy()` makes this an independent
# frame so that adding the 'mol' column below does not write into a slice of
# the sorted table (SettingWithCopyWarning).
baddies = moonshot.sort_values('SAScore', ascending=False).head(20).copy()
baddies['mol'] = baddies.SMILES.apply(Chem.MolFromSmiles)

# Legend per molecule: the score rounded to one decimal on the first line and
# the first 50 characters of the description on the second. Missing
# descriptions are replaced by '' so that every legend is a string.
legends = (
    baddies.SAScore.round(1).astype(str)
    + '\n'
    + baddies.description.fillna('').str.slice(0, 50)
)

# Plain lists are passed so that the drawing code sees positional sequences
# rather than a Series indexed by the original row labels.
Draw.MolsToGridImage(baddies['mol'].to_list(), legends=legends.to_list())

In [None]:
# Number of rows per mid-prefix, most frequent first.
_prefix_counts = moonshot.midprefix.value_counts()

# The same counts as a plain {prefix: count} dict.
midfices = _prefix_counts.to_dict()

# Prefixes that occur more than 50 times, in the same order as the counts.
common_midfices = _prefix_counts.index[_prefix_counts > 50].tolist()

In [None]:
# SAScore distribution for each mid-prefix listed in `common_midfices`,
# with the groups ordered alphabetically by prefix.
_is_common = moonshot.midprefix.isin(common_midfices)
_by_prefix = moonshot.loc[_is_common].sort_values('midprefix')

fig = px.violin(
    _by_prefix,
    x='midprefix',
    y='SAScore',
    title='Distribution of SAScore per group (w/ > 50 submissions)',
)
# No gap between neighbouring violins.
fig.update_layout(violingap=0)
fig