In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import tqdm
from scipy import sparse

# Generate some indices
Even the sparse matrices won't fit in memory, so we have to loop through them chunk by chunk when making predictions or sampling random items.

In [13]:
# Count the number of items per chunk and assemble the full score array.
# Each chunk file is read from disk exactly once and reused for both steps.
N_CHUNKS = 10
score_chunks = [np.load(f'../processed_data/AmpC_all{chunkID}.npy') for chunkID in range(N_CHUNKS)]

# indptr[i] is the offset of chunk i's first item inside the concatenated
# `scores` array; indptr[-1] is the total number of items.
indptr = [0]
for chunk in score_chunks:
    indptr.append(indptr[-1] + chunk.shape[0])

scores = np.concatenate(score_chunks)

# np.concatenate returns a new array, so the per-chunk arrays can be released.
del score_chunks, chunk

In [110]:
# Percentage of entries in `scores` that fall below -60.
below_cutoff = scores<-60
below_cutoff.sum() / scores.shape[0] * 100

1.2983332346819265

In [38]:
# Percentage of all scored items that a subset of 180280 items represents.
# NOTE(review): where the 180280 figure comes from is not shown here — confirm.
subset_size = 180280
subset_size / scores.shape[0] * 100

0.18737357960812545

# Plot

In [2]:
import altair as alt

In [3]:
# List the files in ../processed_data/ whose names start with "ampc_recon".
!ls ../processed_data/ampc_recon*

../processed_data/ampc_reconstruction_0.03.csv
../processed_data/ampc_reconstruction_0.05.csv
../processed_data/ampc_reconstruction_0.125.0.8.csv
../processed_data/ampc_reconstruction_0.125.csv
../processed_data/ampc_reconstruction_0.15.csv
../processed_data/ampc_reconstruction_0.175.csv
../processed_data/ampc_reconstruction_0.1.csv
../processed_data/ampc_reconstruction_0.25.csv
../processed_data/ampc_reconstruction_0.5.csv
../processed_data/ampc_reconstruction_0.75.csv
../processed_data/ampc_reconstruction_1.csv
../processed_data/ampc_reconstruction.csv


In [4]:
# Choose which reconstruction run to plot by its filename suffix. The directory
# listing shown earlier in this notebook contains the suffixes
# 0.03, 0.05, 0.1, 0.125, 0.15, 0.175, 0.25, 0.5, 0.75 and 1.
# NOTE(review): what the suffix encodes is not visible in this notebook — confirm.
RECON_SUFFIX = '0.15'
df = pd.read_csv(f'../processed_data/ampc_reconstruction_{RECON_SUFFIX}.csv')

In [5]:
# Baseline results used for comparison, one row per (algorithm, training size):
#   [algorithm label, training size, mean % top-k found, standard deviation]
# NOTE(review): presumably taken from a publication by Coley et al. — confirm and cite.
prev_results = [
    # training size 400,000
    ['RF (Coley)',  400_000, 71.4, 2.1],
    ['NN (Coley)',  400_000, 74.7, 1.4],
    ['MPN (Coley)', 400_000, 87.9, 2.3],
    # training size 200,000
    ['RF (Coley)',  200_000, 45.5, 1.8],
    ['NN (Coley)',  200_000, 52.8, 0.5],
    ['MPN (Coley)', 200_000, 67.1, 2.1],
    # training size 100,000
    ['RF (Coley)',  100_000, 24.0, 2.2],
    ['NN (Coley)',  100_000, 33.3, 0.3],
    ['MPN (Coley)', 100_000, 52.0, 0.5],
]


In [6]:


# Expand every baseline (mean, std) pair into three synthetic observations so the
# charts below, which derive error bars from individual rows, have rows to work on.
# The offsets [-1, 0, 1] are rescaled so that their population standard deviation
# (np.std's default, ddof=0) equals the reported standard deviation, while their
# mean stays at the reported mean.
EXPLORED_PER_TRAINING_ITEM = 6  # NOTE(review): origin of this factor is not shown here — confirm
unit_offsets = np.array([-1, 0, 1]).astype(float)

coley_rows = []
for algorithm, training_size, mean_pct, std_pct in prev_results:
    offsets = unit_offsets * (std_pct / np.std(unit_offsets))
    for offset in offsets:
        coley_rows.append({
            'Algorithm': algorithm,
            'Training size': training_size,
            'N ligands explored': training_size * EXPLORED_PER_TRAINING_ITEM,
            # Baseline values are given in percent; the frame stores fractions.
            '% top-k found': (offset + mean_pct) / 100,
        })

# Build the frame in a single call from the collected records rather than
# enlarging it one row at a time with .loc and a manual counter.
coley = pd.DataFrame(
    coley_rows,
    columns=['Algorithm', 'Training size', 'N ligands explored', '% top-k found'],
)

In [7]:
# Stack the loaded results (df) and the baseline rows (coley) into one frame.
# ignore_index=True gives the result a fresh 0..n-1 index; without it both inputs
# keep their own row labels, which then repeat and make label-based lookups ambiguous.
concat = pd.concat([df, coley], ignore_index=True)

In [8]:
# Overview figure: one facet column per training size. Each facet layers, per
# algorithm, error bars over the individual rows, the mean as points, and a line
# connecting the means.
y_title = '% top 50,000 found'

# Error bars are derived by Altair from the raw rows (extent='ci').
error_bars = alt.Chart(concat).mark_errorbar(extent='ci').encode(
    x=alt.X('N ligands explored:Q', title='Number of ligands sampled'),
    y=alt.Y('% top-k found:Q', title=y_title),
    color=alt.Color('Algorithm'),
)

# Mean of the rows at each x position, drawn as filled points.
points = alt.Chart(concat).mark_point(filled=True, color='black').encode(
    x=alt.X('N ligands explored:Q'),
    y=alt.Y('% top-k found:Q', aggregate='mean', title=y_title),
    color=alt.Color('Algorithm'),
)

# Thin, semi-transparent line through the same means.
line = alt.Chart(concat).mark_line(color='black', size=1, opacity=0.5).encode(
    x=alt.X('N ligands explored:Q'),
    y=alt.Y('% top-k found:Q', aggregate='mean', title=y_title),
    color=alt.Color('Algorithm'),
)

# The facet order has to be expressed in the column's own values. Training sizes in
# this notebook are 400000 / 200000 / 100000 (see prev_results and the per-size loop
# further down); the earlier sort list [0.004, 0.002, 0.001] does not correspond to
# those values, so the intended largest-first order was not being applied.
ch = (error_bars + points + line).properties(height=400, width=200).facet(
    column=alt.Column('Training size:N', sort=alt.Sort([400_000, 200_000, 100_000])),
).resolve_scale(x='independent')  # each facet gets its own x scale
ch

In [102]:
#df.to_csv('../processed_data/ampc_reconstruction.csv')

In [103]:
# Build one stand-alone layered chart per training size and collect them in `chs`
# in the order 400000, 200000, 100000.
chs = []
for frac in (400000, 200000, 100000):
    # Keep only the rows for this training size and swap the internal label
    # 'morgan_feat' for its display name.
    rows_for_size = concat['Training size']==frac
    df_ = concat[rows_for_size].replace('morgan_feat', 'Morgan pharm. & Log.reg. (ours)')

    # x axis spans 0 to 10000 past the largest number of ligands explored.
    x_domain = [0, max(df_['N ligands explored'])+10000]
    base = alt.Chart(df_)

    # Error bars derived by Altair from the individual rows.
    error_bars = base.mark_errorbar(extent='ci').encode(
        x=alt.X('N ligands explored:Q',
                title='Number of ligands sampled',
                scale=alt.Scale(domain=x_domain)),
        y=alt.Y('% top-k found:Q', scale=alt.Scale(domain=[0,0.95])),
        color=alt.Color('Algorithm'),
    )

    # Per-x mean as filled points.
    points = base.mark_point(filled=True, color='black').encode(
        x=alt.X('N ligands explored:Q'),
        y=alt.Y('% top-k found:Q', aggregate='mean', scale=alt.Scale(domain=[0,0.95])),
        color=alt.Color('Algorithm'),
    )

    # Thin line through the same means.
    line = base.mark_line(color='black', size=1, opacity=0.5).encode(
        x=alt.X('N ligands explored:Q'),
        y=alt.Y('% top-k found:Q', aggregate='mean', scale=alt.Scale(domain=[0,0.95])),
        color=alt.Color('Algorithm'),
    )

    # The title is the training size expressed as a percentage of 100*1e6.
    ch = (error_bars+points+line).properties(width=200, title=str(frac / (100*1e6)*100))
    chs.append(ch)


In [104]:
# Place the per-training-size charts side by side, in the order they were built.
# The bare `chs[0]` expression that used to open this cell has been removed: only
# a cell's last expression is displayed, so it had no effect.
sup = alt.hconcat(*chs)
sup
# NOTE(review): a previous comment here recorded that the saved figure
# ('../figures/ampC_reconstruction.html') was produced "using 0.05", whereas the
# load cell currently reads the 0.15 file — confirm which run the figure should show.

In [109]:
# Combine the three per-size charts into a single chart with Altair's `+` (layer) operator.
chs[0]+chs[1]+chs[2]

In [289]:
# Write the side-by-side figure (`sup`) to an HTML file under ../figures/.
sup.save('../figures/ampC_reconstruction.html')