In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import tqdm
from scipy import sparse

# Generate some indices
Even the sparse matrices won't fit in memory. So we will have to loop through them when making predictions or sampling random items.

In [2]:
#count number of items:
indptr = [0]

for chunkID in range(10):
    scores = np.load(f'../processed_data/AmpC_all{chunkID}.npy')
    indptr.append(indptr[-1] + scores.shape[0])


In [3]:
scores = np.concatenate([np.load(f'../processed_data/AmpC_all{i}.npy') for i in range(10)])

# functions to handle the slabs

For training, these loop through the chunks and extract the indices that have been selected either at random or suggested by the surrogate model. 

For predicting, these loop through the chunks and perform the `predict_proba` method on each chunk (after removing the training indices), outputting a concatenated numpy array of predicted values.

In [4]:
def extractFPs(chunkID, indptr, isTrain):
    fp = sparse.load_npz(f'../processed_data/AmpC_all{chunkID}.npz')
    mask = isTrain[indptr[chunkID]:indptr[chunkID+1]]
    return fp[mask]

def buildTrain(indptr, isTrain, verbose=0):
    if verbose:
        print('building training matrix')
    fps = sparse.vstack([extractFPs(i, indptr, isTrain) for i in range(10)])
    return fps

def chunkPredictProba(model, indptr, isTrain, verbose=0):
    if verbose:
        print('predicting probabilities')
    probas = []
    for chunkID in range(10):
        fps = extractFPs(chunkID, indptr, ~isTrain)
        proba = model.predict_proba(fps)[:,1]
        probas.append(proba)
    return np.concatenate(probas)

def chunkPredict(model, indptr, isTrain, verbose=0):
    if verbose:
        print('predicting probabilities')
    preds = []
    for chunkID in range(10):
        fps = extractFPs(chunkID, indptr, ~isTrain)
        pred = -1*model.predict(fps) #best scoring will now be on top (like the proba)
        preds.append(pred)
    return np.concatenate(preds)

# Train and RF regressor and Logistic Regression models

In [5]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(max_iter=10000, C=1)


# How long to find the 50k - 200k top 0.3% docking scores from one iteration of Logistic regression?

In [6]:
trainingSetSizes = [5000, 10_000] + [10000*2<<i for i in range(0,8)]

num_actual = scores.shape[0] * 0.003

desiredNumLigands = [50_000, 100_000, 150_000, 200_000,]

In [8]:
#this is the _actual_ observed cutoff at 0.3th percentile.
test_cutoff = np.percentile(scores, 0.3)
#mask identifying the top hits.
topK = scores<test_cutoff


#df = pd.DataFrame(columns=['Algorithm', 'Training size', 'Fraction', 'N hits wanted', 'N hits explored'])
df = pd.DataFrame(columns=['Algorithm', 'Training size',  'N hits wanted', 'N hits explored'])
count=0

for i in range(3):
    #for percent in np.array([0.1, 0.25, 0.5, 0.75, 0.9]):
    for numWanted in desiredNumLigands:

        idx = np.arange(scores.shape[0])
        np.random.shuffle(idx)

        for size in trainingSetSizes:
            #numWanted = int(percent * scores.shape[0] * 0.003)
            #print('numWanted:', numWanted, 'percent:', percent)
            #split indices into train and test:
            train = idx[:size].copy()
            test = idx[size:].copy()
            train.sort()
            test.sort()
    
            #generate a 'is a training instance' mask. 
            isTrain = np.zeros(scores.shape[0]).astype(bool)
            isTrain[train]=True
    
            #topK molecules already found in the training set:
            numFound = topK[train].sum()
            numRequired = numWanted - numFound
            
            #fit model:
            cutoff = np.percentile(scores[isTrain],0.3)
            model.fit(buildTrain(indptr, isTrain, 1), scores[isTrain]<cutoff)

            #predict (slowest step):
            proba = chunkPredictProba(model, indptr, isTrain, 1)
        
            #rank the probabilities
            proba_sorted = (-proba).argsort()
            
            #sorted the unseen instances by probability (highest prob first):
            test = test[proba_sorted]

            #topK molecules already found in the training set:
            numSampled = np.argmax(np.cumsum(topK[test])>numRequired)
            
            #df.loc[count] = ['morgan_feat', size, percent, numWanted, numSampled+size]
            df.loc[count] = ['morgan_feat', size,numWanted, numSampled+size]
            count+=1
            print(count, size, numWanted, numSampled+size)
                
            df.to_csv('../processed_data/AmpC_single_'+str(0.3)+'.csv')


building training matrix
predicting probabilities
1 5000 50000 344329
building training matrix
predicting probabilities
2 10000 50000 336375
building training matrix
predicting probabilities
3 20000 50000 324524
building training matrix
predicting probabilities
4 40000 50000 310417
building training matrix
predicting probabilities
5 80000 50000 336111
building training matrix
predicting probabilities
6 160000 50000 380939
building training matrix
predicting probabilities
7 320000 50000 506687
building training matrix
predicting probabilities
8 640000 50000 801795
building training matrix
predicting probabilities
9 1280000 50000 1419876
building training matrix
predicting probabilities
10 2560000 50000 2675204
building training matrix
predicting probabilities
11 5000 100000 925661
building training matrix
predicting probabilities
12 10000 100000 884846
building training matrix
predicting probabilities
13 20000 100000 760983
building training matrix
predicting probabilities
14 40000 1000

building training matrix
predicting probabilities
112 10000 200000 2141253
building training matrix
predicting probabilities
113 20000 200000 2160221
building training matrix
predicting probabilities
114 40000 200000 2008074
building training matrix
predicting probabilities
115 80000 200000 1853942
building training matrix
predicting probabilities
116 160000 200000 1762290
building training matrix
predicting probabilities
117 320000 200000 1751264
building training matrix
predicting probabilities
118 640000 200000 1924092
building training matrix
predicting probabilities
119 1280000 200000 2462034
building training matrix
predicting probabilities
120 2560000 200000 3630749
