# Analysis of the covid moonshot rationale
`COVID_moonshot_submissions` folder is the gitrepo https://github.com/postera-ai/COVID_moonshot_submissions

In [72]:
gitrepo_folder = 'textual-analysis-of-COVID-Moonshot'

In [78]:
import os

import pandas as pd

submission_filename = '../COVID_moonshot_submissions/covid_submissions_all_info.csv'

postera = pd.read_csv(submission_filename)

  interactivity=interactivity, compiler=compiler, result=result)


In [79]:
# the table contains multiple rows/submitted compounds per submission.
# Say the CID `ANT-DIA-3c79be55-1` finishes in `-1`
postera = postera.assign(CID_group=postera.CID.str.extract(r'(.*)\-\d+'))
minpostera = postera.drop_duplicates('CID_group')

## Bag of words

Making a bag of words for a word cloud and to see which terms are enriched

In [80]:
import re
from collections import Counter

word_block = ' '.join(minpostera.rationale) + ' '.join(minpostera['Submission Notes'].astype(str))
word_block = word_block.replace('by eye', 'by-eye').replace("n't", '')
words = re.findall('[\w\-]+', word_block.lower())
wordbag = Counter(words)

In [107]:
wordbag['fragalysis']

21

In [82]:
# common words (`the`, `of` etc.) need to be remove from the wordbag

import requests

r = requests.get('https://gist.githubusercontent.com/deekayen/4148741/raw/98d35708fa344717d8eee15d11987de6c8e26d7d/1-1000.txt')

common_words = set(r.text.split('\n')) - {'molecule', 'machine', 'dock', 'learn'}

# shoddy way of expanding it.
# `againsted` is not a real word, but it's presence does not harm anyone.
expanded_common_words = set(list(common_words) + \
                            [w+'s' for w in common_words] + \
                            [w+'es' for w in common_words] + \
                            [w+'ed' for w in common_words] + \
                            [w+'ing' for w in common_words] + \
                            [w+'ly' for w in common_words] + \
                            ['http', 'https', 'com', 'org', 'www']
                           )

In [83]:
## remove them

for k in expanded_common_words.intersection(wordbag.keys()):
    del wordbag[k]
    
for k in list(wordbag.keys()):
    if k.isdigit() or len(k) < 3:
        del wordbag[k]

In [84]:
wordbag.most_common()

[('nan', 1061),
 ('fragments', 446),
 ('compounds', 443),
 ('molecules', 385),
 ('fragment', 367),
 ('docking', 326),
 ('structure', 325),
 ('binding', 304),
 ('based', 271),
 ('pocket', 249),
 ('structures', 242),
 ('bond', 226),
 ('molecule', 210),
 ('hydrogen', 192),
 ('covalent', 189),
 ('crystal', 188),
 ('compound', 181),
 ('into', 177),
 ('site', 176),
 ('by-eye', 173),
 ('enamine', 169),
 ('ligand', 167),
 ('docked', 164),
 ('pdb', 156),
 ('available', 153),
 ('protein', 147),
 ('screening', 147),
 ('interaction', 138),
 ('interactions', 129),
 ('library', 129),
 ('non-covalent', 128),
 ('synthesis', 127),
 ('active', 125),
 ('protease', 119),
 ('activity', 118),
 ('predictions', 113),
 ('molecular', 110),
 ('domain', 109),
 ('model', 107),
 ('generated', 106),
 ('amide', 104),
 ('applicability', 104),
 ('predicted', 102),
 ('affinity', 100),
 ('different', 92),
 ('module', 92),
 ('stages', 92),
 ('seesar', 87),
 ('per', 86),
 ('residues', 85),
 ('further', 84),
 ('linking', 84

In [106]:
# make word cloud
with open('wordcloud.txt', 'w') as w:
    w.write('\n'.join([' '.join([k] * wordbag[k]) for k in wordbag if wordbag[k] > 20 and k != 'nan']))

## Find enrichment

In [112]:
from typing import Dict

def get_data_on_term(term:str) -> Dict[str, float]:
    if term == 'by-eye':
        term = 'by eye'
    subtable = postera.loc[postera.rationale.str.contains(term, case=False)]
    return {'term': term, **get_data_on_table(subtable)}
    
def get_data_on_table(table:pd.DataFrame) -> Dict[str, float]:
    data = {}
    data['N'] = len(table)
    if data['N'] == 0:
        print(f'Table failed!')
        return data
    # ordered
    ordered = table.ORDERED.value_counts()
    data['N_ordered'] = ordered[True] if True in ordered else 0
    data['N_not_ordered'] = ordered[False] if False in ordered else 0
    data['freq_ordered (of total)'] = round(data['N_ordered'] / data['N'], 4)
    # made
    made = table.MADE.value_counts()
    data['N_made'] = made[True] if True in made else 0
    data['N_not_made'] = made[False] if False in made else 0
    data['freq_made (of total)'] = round(data['N_made'] / data['N'], 4)
    data['freq_made (of ordered)'] = round(data['N_made']/data['N_ordered'], 4) if data['N_ordered'] != 0 else 0
    # xstalised
    xstalised = table.structure_ID.value_counts().sum()
    data['N_crystallised'] = xstalised
    data['N_not_crystallised'] = data['N'] - xstalised
    data['freq_crystallised (of total)'] = round(xstalised/data['N'], 4)
    data['freq_crystallised (of made)'] = round(xstalised/data['N_made'], 4) if data['N_made'] != 0 else 0
    # assayed
    assayed = table.ASSAYED.value_counts()
    data['N_assayed'] = assayed[True] if True in assayed else 0
    data['N_not_assayed'] = assayed[False] if False in assayed else 0
    data['freq_assayed (of total)'] = round(data['N_assayed'] / data['N'], 4)
    data['freq_assayed (of made)'] = round(data['N_assayed']/data['N_made'], 4)  if data['N_made'] != 0 else 0
    # return
    return data

In [113]:
# pd.DataFrame({'term': pd.Series(data = list(wordbag.keys()), dtype=str),
#               'count': pd.Series(data = list(wordbag.values()), dtype=int)})

In [114]:
data = [get_data_on_term(term) for term in wordbag]

term_table = pd.DataFrame(data)
term_table

Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table 

Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table failed!
Table 

Unnamed: 0,term,N,N_ordered,N_not_ordered,freq_ordered (of total),N_made,N_not_made,freq_made (of total),freq_made (of ordered),N_crystallised,N_not_crystallised,freq_crystallised (of total),freq_crystallised (of made),N_assayed,N_not_assayed,freq_assayed (of total),freq_assayed (of made)
0,nitrile,623,24.0,599.0,0.0385,18.0,605.0,0.0289,0.7500,4.0,619.0,0.0064,0.2222,18.0,605.0,0.0289,1.0
1,x0305,216,68.0,148.0,0.3148,39.0,177.0,0.1806,0.5735,6.0,210.0,0.0278,0.1538,39.0,177.0,0.1806,1.0
2,superimpose,56,13.0,43.0,0.2321,5.0,51.0,0.0893,0.3846,0.0,56.0,0.0000,0.0000,5.0,51.0,0.0893,1.0
3,x1249,188,34.0,154.0,0.1809,22.0,166.0,0.1170,0.6471,4.0,184.0,0.0213,0.1818,22.0,166.0,0.1170,1.0
4,x0434,375,77.0,298.0,0.2053,52.0,323.0,0.1387,0.6753,12.0,363.0,0.0320,0.2308,52.0,323.0,0.1387,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5900,removal,0,,,,,,,,,,,,,,,
5901,t-butyl,0,,,,,,,,,,,,,,,
5902,plenty,0,,,,,,,,,,,,,,,
5903,raw,151,4.0,147.0,0.0265,1.0,150.0,0.0066,0.2500,0.0,151.0,0.0000,0.0000,1.0,150.0,0.0066,1.0


In [115]:
term_table.to_csv(f'{gitrepo_folder}/term_frequencies.csv')

In [116]:
data.append({'term': '<TALLY>', **get_data_on_table(postera)})
term_table = pd.DataFrame(data)

In [117]:
tally_row = term_table.loc[term_table.term == '<TALLY>'].iloc[0]
tally_row

term_table = term_table[['term', 'N', 
                         'N_ordered', 'N_not_ordered',
                         'N_made', 'N_not_made',
                         'N_assayed', 'N_not_assayed',
                         'N_crystallised', 'N_not_crystallised',
                         'freq_ordered (of total)',
                         'freq_made (of total)', 'freq_assayed (of total)', 'freq_crystallised (of total)',
                         'freq_made (of ordered)', 'freq_assayed (of made)', 'freq_crystallised (of made)'
                        ]]

In [118]:
from scipy.stats import chisquare, nbinom, fisher_exact

new_columns = {}
for field in ('ordered', 'made',  'assayed', 'crystallised'):
    fisherise = lambda row: fisher_exact([[row.N, row[f'N_{field}'] ],
                                          [tally_row.N, tally_row[f'N_{field}'] ]
                                         ])[1] if str(row[f'N_{field}']) != 'nan' else float('nan')
    chiify = lambda row: chisquare(f_obs=[row.N, row[f'N_{field}']],
                                   f_exp=[tally_row.N, tally_row[f'N_{field}'] ]).pvalue
    new_columns[f'fisher_p_{field}'] = term_table.apply(fisherise,axis=1)
    new_columns[f'χ2_p_{field}'] = term_table.apply(chiify,axis=1)

term_table = term_table.assign(**new_columns)

In [119]:
import numpy as np

new_columns = {}
for field in ('freq_ordered (of total)',
              'freq_made (of total)',
              'freq_assayed (of total)', 
              'freq_crystallised (of total)',
              'freq_made (of ordered)', 
              'freq_assayed (of made)',
              'freq_crystallised (of made)'):
    new_columns[f'log2_{field}'] = np.log2(term_table[field].values / tally_row[field])

term_table = term_table.assign(**new_columns)

  # This is added back by InteractiveShellApp.init_path()


In [120]:
term_table.to_csv(f'{gitrepo_folder}/terms.csv')

In [121]:
## Crystallised

total = len(term_table)
crystal = term_table.loc[term_table.fisher_p_crystallised < 0.05/total]\
                    .sort_values('freq_crystallised (of total)', ascending=False)\
                    .head(100)
crystal.to_csv(f'{gitrepo_folder}/Top100_terms_enchriched_for_crystallisation.csv')
crystal

Unnamed: 0,term,N,N_ordered,N_not_ordered,N_made,N_not_made,N_assayed,N_not_assayed,N_crystallised,N_not_crystallised,...,χ2_p_assayed,fisher_p_crystallised,χ2_p_crystallised,log2_freq_ordered (of total),log2_freq_made (of total),log2_freq_assayed (of total),log2_freq_crystallised (of total),log2_freq_made (of ordered),log2_freq_assayed (of made),log2_freq_crystallised (of made)
2741,moonshot,290,57.0,233.0,34.0,256.0,33.0,257.0,98.0,192.0,...,0.0,2.503830e-76,0.0,0.691934,0.382580,0.367650,4.321501,-0.307880,-0.015376,3.933869
3639,cid,378,51.0,327.0,28.0,350.0,27.0,351.0,93.0,285.0,...,0.0,1.402962e-62,0.0,0.148561,-0.278848,-0.304855,3.863563,-0.427596,-0.024771,4.138390
406,submissions,269,69.0,200.0,45.0,224.0,45.0,224.0,52.0,217.0,...,0.0,1.114159e-32,0.0,1.075630,0.896044,0.923587,3.515746,-0.179088,0.027675,2.615241
4398,shot,545,61.0,484.0,37.0,508.0,36.0,509.0,98.0,447.0,...,0.0,6.609641e-55,0.0,-0.121119,-0.404910,-0.416128,3.411298,-0.283657,-0.011813,3.811828
3643,try-uni-714a760b-6,74,55.0,19.0,48.0,26.0,48.0,26.0,12.0,62.0,...,0.0,2.895993e-08,0.0,2.610421,2.850936,2.878478,3.262679,0.241083,0.027675,0.406598
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83,cysteine,1384,7.0,1377.0,4.0,1380.0,4.0,1380.0,0.0,1384.0,...,0.0,4.380464e-10,0.0,-4.576688,-4.954196,-4.926654,-inf,-0.369902,0.027675,-inf
2099,calculated,2526,3.0,2523.0,1.0,2525.0,1.0,2525.0,0.0,2526.0,...,0.0,2.405097e-17,0.0,-6.664151,-7.812177,-7.784635,-inf,-1.147581,0.027675,-inf
1960,proteases,1274,1.0,1273.0,1.0,1273.0,1.0,1273.0,0.0,1274.0,...,0.0,2.538007e-09,0.0,-7.249113,-6.812177,-6.784635,-inf,0.437526,0.027675,-inf
1959,calculation,770,34.0,736.0,32.0,738.0,32.0,738.0,0.0,770.0,...,0.0,7.994828e-06,0.0,-1.461211,-1.111738,-1.084195,-inf,0.350099,0.027675,-inf


In [122]:
## Crystallised 2

total = len(term_table)
crystal = term_table.loc[term_table.fisher_p_crystallised < 0.05/total]\
                    .sort_values('freq_crystallised (of made)', ascending=False)\
                    .head(100)
crystal.to_csv(f'{gitrepo_folder}/Top100_terms_enchriched_for_crystallisation_of_made.csv')
crystal

Unnamed: 0,term,N,N_ordered,N_not_ordered,N_made,N_not_made,N_assayed,N_not_assayed,N_crystallised,N_not_crystallised,...,χ2_p_assayed,fisher_p_crystallised,χ2_p_crystallised,log2_freq_ordered (of total),log2_freq_made (of total),log2_freq_assayed (of total),log2_freq_crystallised (of total),log2_freq_made (of ordered),log2_freq_assayed (of made),log2_freq_crystallised (of made)
3342,batch,1027,31.0,996.0,14.0,1013.0,14.0,1013.0,91.0,936.0,...,0.0,4.283002e-30,0.0,-2.010709,-2.724714,-2.697172,2.390283,-0.709357,0.027675,5.107038
3639,cid,378,51.0,327.0,28.0,350.0,27.0,351.0,93.0,285.0,...,0.0,1.402962e-62,0.0,0.148561,-0.278848,-0.304855,3.863563,-0.427596,-0.024771,4.138390
2741,moonshot,290,57.0,233.0,34.0,256.0,33.0,257.0,98.0,192.0,...,0.0,2.503830e-76,0.0,0.691934,0.382580,0.367650,4.321501,-0.307880,-0.015376,3.933869
4398,shot,545,61.0,484.0,37.0,508.0,36.0,509.0,98.0,447.0,...,0.0,6.609641e-55,0.0,-0.121119,-0.404910,-0.416128,3.411298,-0.283657,-0.011813,3.811828
3866,treweren,4207,5.0,4202.0,5.0,4202.0,4.0,4203.0,9.0,4198.0,...,0.0,3.229300e-17,0.0,-6.664151,-6.227215,-6.462707,-3.008562,0.437526,-0.294253,3.254595
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
479,primary,1338,5.0,1333.0,5.0,1333.0,5.0,1333.0,0.0,1338.0,...,0.0,1.114396e-09,0.0,-5.039660,-4.602724,-4.575181,-inf,0.437526,0.027675,-inf
1960,proteases,1274,1.0,1273.0,1.0,1273.0,1.0,1273.0,0.0,1274.0,...,0.0,2.538007e-09,0.0,-7.249113,-6.812177,-6.784635,-inf,0.437526,0.027675,-inf
136,centre,2966,23.0,2943.0,20.0,2946.0,20.0,2946.0,0.0,2966.0,...,0.0,3.171292e-20,0.0,-3.963711,-3.746088,-3.718546,-inf,0.235949,0.027675,-inf
970,served,3256,27.0,3229.0,20.0,3236.0,20.0,3236.0,0.0,3256.0,...,0.0,7.755804e-22,0.0,-3.874074,-3.881440,-3.853898,-inf,0.004487,0.027675,-inf


In [127]:
headers = ['term','N','N_ordered','N_made','N_assayed','N_crystallised','fisher_p_made','fisher_p_crystallised','log2_freq_made (of total)','log2_freq_crystallised (of total)','log2_freq_crystallised (of made)']
term_table.loc[term_table.term.isin(['dock', 'score', 'merge', 'enumerate', 'calculate', 'by eye', 'fragalysis'])][headers]#.to_markdown()

Unnamed: 0,term,N,N_ordered,N_made,N_assayed,N_crystallised,fisher_p_made,fisher_p_crystallised,log2_freq_made (of total),log2_freq_crystallised (of total),log2_freq_crystallised (of made)
15,merge,362,52.0,21.0,21.0,7.0,0.05815363,0.6787069,-0.632268,0.191578,0.821492
20,fragalysis,69,5.0,4.0,3.0,4.0,0.5225328,0.03473028,-0.632268,1.77903,2.406598
44,by eye,1795,124.0,84.0,84.0,16.0,9.269836e-10,0.009114118,-0.941813,-0.925146,0.014461
1496,dock,4571,255.0,188.0,187.0,23.0,3.2379639999999996e-26,1.330122e-10,-1.129183,-1.757023,-0.624905
2744,enumerate,127,25.0,4.0,4.0,1.0,0.02486778,0.7270644,-1.512969,-1.097099,0.406598


In [124]:
import shutil, os
from git import Repo
shutil.copy('rationale analysis.ipynb', f'{gitrepo_folder}/rationale_analysis.ipynb')

repo = Repo(gitrepo_folder)
repo.index.add(os.listdir(gitrepo_folder))   
repo.index.commit(':truck: data')
repo.remotes.origin.push()

NameError: name 'shutils' is not defined