In [None]:
# Submitter names that are flagged as `internal`: further down this cell each
# `clean_creator` value is lower-cased and checked for membership in this list
# (also lower-cased).
# Several people are listed under more than one spelling
# (e.g. 'Ed Griffen' / 'Edward Griffen', 'Matt Hurley' / 'Matthew Hurley').
# NOTE(review): judging by the `on_slack` variable name these are presumably
# the members of the project Slack workspace — TODO confirm.
cabal = ['Aaron Morris',
         'Alessandro Contini',
         'Alpha Lee',
         'Andre Schützer de Godoy',
         'Andrea Rizzi',
         'Anna Carbery',
         'Annette von Delft',
         'Anthony Aimon',
         'Austin Clyde',
         'Daren Fearon',
         'Demetri Moustakas',
         'Ed Griffen',
         'Edward Griffen',
         'Frank von Delft',
         'Garrett M. Morris',
         'Garrett Morris',
         'Halina Mikolajek',
         'Hannah Bruce Macdonald',
         'Hannah Bruce-Macdonald',
         'Jason Cole',
         'Jason Pattis',
         'Jenke Scheen',
         'Jin Pan',
         'John Chodera',
         'Kadi Liis Saar',
         'Lizbé Koekemoer',
         'Mark Calmiano',
         'Mark Williamson',
         'Matt Hurley',
         'Matt Robinson',
         'Matt Wittmann',
         'Matteo Ferla',
         'Matthew Hurley',
         'Matthew Robinson',
         'Matthew Wittmann',
         'Med-Chem team',
         'Melissa Boby',
         'Michael Retchin',
         'Mihaela Smilova',
         'Milan Cvitkovic',
         'Nir London',
         'Rajath Salegame',
         'Richard Foster',
         'Robert Glen',
         'Ruby Pai',
         'Tim Dudgeon',
         'Tyler Gorrie-Stone',
         'Vincent Voelz',
         'Warren Thompson',
         'Will Glass',
         'William Glass',
         'William McCorkindale',
         'Yuanqing Wang']

import pandas as pd
import numpy as np

# Load the full submissions table.
# NOTE(review): unpickling executes arbitrary code — only load a trusted file.
_m = pd.read_pickle('moonshot_submissions.p')

# Keep the rows flagged by the `okay` column and the columns used in this
# analysis. Rows and columns are selected in a single `.loc` call and the
# result is copied explicitly, so that adding columns to `moonshot` later on
# neither raises SettingWithCopyWarning nor depends on `_m`.
# 'xcode' is listed once only: listing it twice produced a duplicated column
# label, which makes `moonshot['xcode']` return a DataFrame instead of a Series.
moonshot = _m.loc[
    _m.okay,
    ['CID (canonical)', 'CID_group', 'old_index', 'clean_creator', 'SMILES', 'new_smiles',
     'fragments', 'xcode', 'Structure ID', 'site_name', 'pdb_entry',
     'series',
     'ORDERED', 'MADE', 'ASSAYED', 'in_fragalysis',
     'IC50', 'pIC50',
     'submission_date', 'inferred_submission_date', 'order_date', 'shipment_date',
     'description', 'initial_screen',
     'N_creator_submission', 'N_submission_group', 'resubmitted',
     'Enamine - REAL Space', 'Enamine - Extended REAL Space',
     'Enamine - SCR', 'Enamine - BB', 'Mcule', 'Mcule Ultimate',
     'N_chars', 'N_words', 'N_words_cutoff', 'classified_method', 'flesch',
     'dale_chall'],
].copy()

# The unfiltered table is no longer needed; release it.
del _m

# A submission is `internal` when its creator's name matches, ignoring case,
# one of the names in `cabal`.
on_slack = moonshot.clean_creator.str.lower().isin({name.lower() for name in cabal})
moonshot['internal'] = on_slack

In [None]:
## Expanding the match to the middle 3-letter code of the CID is misleading
## and does not improve much

# Ten most frequent `XXX-YYY` CID prefixes among the anonymous submissions.
# (This is not the last expression of the cell, so its value is not displayed.)
(
    moonshot
    .loc[moonshot.clean_creator == 'anonymous', 'CID (canonical)']
    .str.extract(r'(?P<prefix>\w{3}-\w{3})')
    .value_counts()
    .head(10)
)

# Boolean Series: True where the middle 3-letter code of the CID is one of the
# listed codes.
w_mid_prefix = (
    moonshot['CID (canonical)']
    .str.extract(r'\w{3}-(?P<mid_prefix>\w{3})')
    .isin('MED POS MSK DIA LEE XCH WEI UCB LON'.split())
    ['mid_prefix']
)
# Deliberately left disabled (see the heading of this cell):
# moonshot['internal'] = on_slack | w_mid_prefix

`moonshot.series.str.contains('Ugi')` is not a good marker for prior SARS inhibitors.


By contrast, the phrase `SARS inhibitor` in the description is a surprisingly good marker.

NB: some Alpha Lee submissions are formatted oddly, including `ALP-POS-c59291d4-5`, hence the removal of commas before matching.

In [None]:
import random
from datetime import timedelta
# --- Jittered shipment date -------------------------------------------------
# Gaussian noise with sigma = 3.5 days, i.e. 2 sigma (95%) = 1 week.
# A dedicated, seeded generator makes the jitter (and hence the figure that
# uses it) identical on every Restart & Run All, without touching the global
# `random` state.
_wobble_rng = random.Random(42)
moonshot['shipment_date_wobbled'] = moonshot.shipment_date.apply(
    lambda d: d + timedelta(days=_wobble_rng.gauss(0, 3.5))
)


def _shipment_quarter(d) -> int:
    """Return the calendar quarter of ``d`` counted from 2020-Q1.

    2020-Q1 (Jan-Mar) is 1, 2020-Q4 (Oct-Dec) is 4, 2021-Q1 is 5, and so on.
    A missing date (NaT / NaN / None) gives the sentinel -1.

    The previous formula used ``d.month // 3``, which placed March in Q2 and
    December in the first quarter of the following year.
    """
    if pd.isna(d):
        return -1
    return (d.month - 1) // 3 + 1 + 4 * (d.year - 2020)


moonshot['quarter (shipment)'] = moonshot.shipment_date.apply(_shipment_quarter)

# --- Prior SARS inhibitor flag ----------------------------------------------
# Case-insensitive search for the literal phrase 'sars inhibitor' in the
# description, with commas removed first. `na=False` makes a missing
# description count as "not prior", so the column is always boolean.
moonshot['prior'] = (
    moonshot.description
    .str.lower()
    .str.replace(',', '', regex=False)
    .str.contains('sars inhibitor', regex=False, na=False)
)

# --- Category ---------------------------------------------------------------
# 'Prior SARS inhibitor' takes precedence over the internal/external split;
# otherwise internal submissions are 'Core' and the rest 'External'.
moonshot['category'] = (
    moonshot.internal
    .map({True: 'Core', False: 'External'})
    .mask(moonshot.prior, 'Prior SARS inhibitor')
)

# First 50 characters of the description.
moonshot['short'] = moonshot.description.str.slice(0, 50)

In [None]:
import plotly.express as px

# Scatter of potency (pIC50) against the jittered shipment date, one point per
# compound, coloured by submitter category. The figure is written to a JPEG
# and then shown as the cell output.
fig = px.scatter(
    moonshot,
    x='shipment_date_wobbled',
    y='pIC50',
    color='category',
    opacity=0.5,
    hover_data=['CID (canonical)', 'clean_creator', 'shipment_date', 'site_name', 'short'],
    title='Distribution of potency of compounds submitted by core team vs. greater scientific community',
    template="plotly_white",
)
fig.write_image("core-v-comm_distro.jpg")
fig

In [None]:
import plotly.express as px

# Violin plot of potency (pIC50) per shipment quarter, split by submitter
# category. Only quarters 1-4 are kept, i.e. the four quarters of 2020 named
# in the title; this also excludes the -1 sentinel that marks a missing
# shipment date, which the previous `< 5` filter let through.
fig = px.violin(
    moonshot.loc[moonshot['quarter (shipment)'].between(1, 4)],
    x='quarter (shipment)',
    y='pIC50',
    color='category',
    box=True,
    points='all',
    hover_data=['CID (canonical)', 'clean_creator', 'shipment_date', 'site_name'],
    title='Distribution of potency of compounds submitted in 2020 by core team vs. greater scientific community',
    template="plotly_white",
)
# Restrict each violin to the observed data range *before* exporting, so the
# saved image matches the figure displayed below (previously the image was
# written first and therefore lacked this setting).
fig.update_traces(spanmode='hard')
fig.write_image("core-v-comm_vio2020.jpg")
fig