In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import tqdm
from scipy import sparse

# Generate some indices
Even the sparse matrices won't fit in memory all at once, so we have to loop through them chunk by chunk when making predictions or sampling random items.

In [2]:
# Count the items in every chunk and record cumulative row offsets:
# chunk k occupies rows indptr[k]:indptr[k+1] of the full data set.
indptr = [0]
runningTotal = 0

for chunkID in range(10):
    scores = np.load(f'../processed_data/AmpC_all{chunkID}.npy')
    runningTotal += scores.shape[0]
    indptr.append(runningTotal)


In [3]:
# Load the score array of every chunk and join them, in chunk order, into one array.
scorePaths = [f'../processed_data/AmpC_all{i}.npy' for i in range(10)]
scores = np.concatenate([np.load(path) for path in scorePaths])

# Functions to handle the slabs (data chunks)

In [4]:
def extractFPs(chunkID, indptr, isTrain):
    """Load one chunk's sparse matrix from disk and keep only the selected rows.

    Parameters
    ----------
    chunkID : int
        Index of the chunk; selects the ``AmpC_all{chunkID}.npz`` file.
    indptr : sequence of int
        Cumulative row offsets; chunk ``k`` corresponds to positions
        ``indptr[k]:indptr[k+1]`` of ``isTrain``.
    isTrain : array of bool
        Row-selection mask over the full data set (all chunks together).

    Returns
    -------
    The rows of the chunk's sparse matrix for which the mask is True.
    """
    chunkMatrix = sparse.load_npz(f'../processed_data/AmpC_all{chunkID}.npz')
    # Cut the global mask down to the part that belongs to this chunk.
    start = indptr[chunkID]
    stop = indptr[chunkID+1]
    rowMask = isTrain[start:stop]
    return chunkMatrix[rowMask]

def buildTrain(indptr, isTrain, verbose=0):
    """Stack the selected rows of every chunk into a single sparse matrix.

    Parameters
    ----------
    indptr : sequence of int
        Cumulative row offsets, one more entry than there are chunks.
    isTrain : array of bool
        Mask over the full data set; True marks a row to include.
    verbose : int, optional
        If truthy, print a progress message.

    Returns
    -------
    Sparse matrix with the selected rows of all chunks, in chunk order.
    """
    if verbose:
        print('building training matrix')
    # The number of chunks follows from indptr instead of being hard-coded to 10,
    # so the function keeps working if the data are split differently.
    numChunks = len(indptr) - 1
    return sparse.vstack(
        [extractFPs(chunkID, indptr, isTrain) for chunkID in range(numChunks)]
    )

def chunkPredictProba(model, indptr, isTrain, verbose=0):
    """Predict positive-class probabilities for all non-training rows, chunk by chunk.

    Parameters
    ----------
    model : fitted classifier
        Must provide ``predict_proba``; column 1 of its output is used.
    indptr : sequence of int
        Cumulative row offsets, one more entry than there are chunks.
    isTrain : array of bool
        Mask over the full data set; rows where it is False are predicted.
    verbose : int, optional
        If truthy, print a progress message.

    Returns
    -------
    1-D array with one probability per non-training row, in chunk order
    (empty if there are no non-training rows).
    """
    if verbose:
        print('predicting probabilities')
    probas = []
    # The number of chunks follows from indptr instead of being hard-coded to 10.
    for chunkID in range(len(indptr) - 1):
        fps = extractFPs(chunkID, indptr, ~isTrain)
        if fps.shape[0] == 0:
            # Nothing to predict in this chunk; skipping keeps the output aligned
            # and avoids handing an empty matrix to the model.
            continue
        probas.append(model.predict_proba(fps)[:, 1])
    if not probas:
        return np.array([])
    return np.concatenate(probas)

def chunkPredict(model, indptr, isTrain, verbose=0):
    """Predict (negated) values for all non-training rows, chunk by chunk.

    Parameters
    ----------
    model : fitted estimator
        Must provide ``predict``.
    indptr : sequence of int
        Cumulative row offsets, one more entry than there are chunks.
    isTrain : array of bool
        Mask over the full data set; rows where it is False are predicted.
    verbose : int, optional
        If truthy, print a progress message.

    Returns
    -------
    1-D array with ``-1 * model.predict(...)`` for every non-training row, in
    chunk order (empty if there are no non-training rows).
    """
    if verbose:
        # This function calls predict(), not predict_proba(); the message used
        # to be a copy of the one in chunkPredictProba.
        print('predicting scores')
    preds = []
    # The number of chunks follows from indptr instead of being hard-coded to 10.
    for chunkID in range(len(indptr) - 1):
        fps = extractFPs(chunkID, indptr, ~isTrain)
        if fps.shape[0] == 0:
            # Nothing to predict in this chunk; skip rather than call the model
            # on an empty matrix.
            continue
        # Negate so that the best-scoring items end up on top, like the probabilities.
        preds.append(-1*model.predict(fps))
    if not preds:
        return np.array([])
    return np.concatenate(preds)

# Train an RF regressor and Logistic Regression models

In [5]:
from sklearn.linear_model import LogisticRegression

# Classifier used by the experiment below (fitted with .fit, queried with predict_proba).
model = LogisticRegression(C=0.1, max_iter=10_000)

# Alternative kept for reference (not active):
#from sklearn.linear_model import Ridge
#model = Ridge()

# How long does it take to find 25k - 300k of the top 0.4% of ligands?

In [6]:
# Training-set sizes: start at 5000 and double nine times (5000 ... 2,560,000).
trainingSetSizes = [5000 * 2**k for k in range(10)]
# Numbers of top-scoring ligands we want to end up with.
desiredNumLigands = [thousands * 1000 for thousands in (25, 50, 100, 200, 300)]

In [9]:
# Expected number of random draws needed when hits make up 0.4% of the library.
_expectedDraws = np.asarray(desiredNumLigands) / 0.004
# Convert seconds to days (presumably assumes one ligand per second -- TODO confirm).
_expectedDraws / 60 / 60 / 24


array([ 72.33796296, 144.67592593, 289.35185185, 578.7037037 ,
       868.05555556])

In [17]:
# Percentile of the full score distribution below which a ligand counts as a hit.
TOP_PERCENT = 0.4
# Percentile of the training scores below which a training ligand is labelled positive.
TRAIN_LABEL_PERCENT = 0.8
# Number of times the whole experiment is repeated with fresh random splits.
N_REPEATS = 3
# Fixed seed so that the random train/test splits are reproducible on re-run.
RANDOM_SEED = 0


def _numSamplesToFind(isHit, numRequired):
    """Return how many items must be taken, in order, to collect ``numRequired`` hits.

    Parameters
    ----------
    isHit : array of bool
        Hit flags of the candidates, already sorted in the order they would be sampled.
    numRequired : int
        Number of hits still needed.

    Returns
    -------
    0 if no further hits are needed, the 1-based position of the
    ``numRequired``-th hit otherwise, or NaN when ``isHit`` does not contain
    enough hits (the old ``argmax`` approach silently returned 0 here).
    """
    if numRequired <= 0:
        return 0
    hitsSoFar = np.cumsum(isHit)
    if hitsSoFar.size == 0 or hitsSoFar[-1] < numRequired:
        return np.nan
    # First position where the running hit count reaches numRequired; +1 turns
    # the 0-based index into a number of sampled items.
    return int(np.searchsorted(hitsSoFar, numRequired, side='left')) + 1


test_cutoff = np.percentile(scores, TOP_PERCENT)

# True for every ligand whose score lies below the hit cutoff.
topK = scores < test_cutoff

df = pd.DataFrame(columns=['Algorithm', 'Training size', 'N hits wanted', 'N hits explored'])
count = 0
rng = np.random.RandomState(RANDOM_SEED)

for repeat in range(N_REPEATS):
    for numWanted in desiredNumLigands:
        # New random ordering for every (repeat, numWanted) combination; the
        # training sets of increasing size are nested prefixes of it.
        idx = np.arange(scores.shape[0])
        rng.shuffle(idx)

        for size in trainingSetSizes:
            #split indices into train and test:
            train = idx[:size].copy()
            test = idx[size:].copy()
            train.sort()
            # test must be in ascending order so it lines up with the
            # predictions, which come back chunk by chunk in row order.
            test.sort()

            #generate a 'is a training instance' mask (allocated as bool directly,
            #without a float64 temporary):
            isTrain = np.zeros(scores.shape[0], dtype=bool)
            isTrain[train] = True

            #topK molecules already found in the training set:
            numFound = topK[train].sum()
            numRequired = numWanted - numFound

            #fit model:
            cutoff = np.percentile(scores[isTrain], TRAIN_LABEL_PERCENT)
            model.fit(buildTrain(indptr, isTrain, 1), scores[isTrain] < cutoff)

            #predict (slowest step):
            proba = chunkPredictProba(model, indptr, isTrain, 1)

            #rank the probabilities
            proba_sorted = (-proba).argsort()

            #sort the unseen instances by probability (highest prob first):
            test = test[proba_sorted]

            #number of unseen molecules that must be sampled, in ranked order,
            #to collect the hits still required:
            numSampled = _numSamplesToFind(topK[test], numRequired)

            # Rows are appended one at a time so partial results survive an
            # interrupted run.
            df.loc[count] = ['morgan_feat', size, numWanted, numSampled+size]
            count += 1
            print(count, size, numWanted, numSampled+size)

            #df.to_csv('../processed_data/AmpC_single_'+str(0.4)+'.csv')


building training matrix
predicting probabilities
1 5000 25000 114732
building training matrix
predicting probabilities
2 10000 25000 106772
building training matrix
predicting probabilities
3 20000 25000 115994
building training matrix
predicting probabilities
4 40000 25000 132108
building training matrix
predicting probabilities
5 80000 25000 163991
building training matrix
predicting probabilities
6 160000 25000 238455
building training matrix
predicting probabilities
7 320000 25000 392069
building training matrix
predicting probabilities
8 640000 25000 703429
building training matrix
predicting probabilities
9 1280000 25000 1334626
building training matrix
predicting probabilities
10 2560000 25000 2599583
building training matrix
predicting probabilities
11 5000 50000 283599
building training matrix
predicting probabilities
12 10000 50000 271329
building training matrix
predicting probabilities
13 20000 50000 240087
building training matrix
predicting probabilities
14 40000 50000 2

predicting probabilities
112 10000 50000 243509
building training matrix
predicting probabilities
113 20000 50000 234265
building training matrix
predicting probabilities
114 40000 50000 237884
building training matrix
predicting probabilities
115 80000 50000 266192
building training matrix
predicting probabilities
116 160000 50000 331464
building training matrix
predicting probabilities
117 320000 50000 480341
building training matrix
predicting probabilities
118 640000 50000 785669
building training matrix
predicting probabilities
119 1280000 50000 1409031
building training matrix
predicting probabilities
120 2560000 50000 2667132
building training matrix
predicting probabilities
121 5000 100000 582095
building training matrix
predicting probabilities
122 10000 100000 531280
building training matrix
predicting probabilities
123 20000 100000 487450
building training matrix
predicting probabilities
124 40000 100000 464802
building training matrix
predicting probabilities
125 80000 1000