In [1]:
from wiselydock.database import Master
from wiselydock.learning import ActiveLearningInstance, Batch, Iterations, ActiveLearningModel

from functools import wraps
from time import time
from tqdm import tqdm

import pandas as pd
import numpy as np

In [2]:
m = Master.from_json('/storage/marinegor/github/wiselydock-server/data/D4_1M/master.json')
instance = ActiveLearningInstance(m)

In [4]:
instance

ActiveLearningInstance(database=Master(extensions=[<Extension.npy: 'npy'>, <Extension.csv: 'csv'>], folder=PosixPath('/storage/marinegor/github/wiselydock-server/data/D4_1M')), add_to_train=False, regime=<Iterations.LastModel: '3'>, n_splits=5, chunksize=10000, model=ActiveLearningModel(regime=<Iterations.LastModel: '3'>, n_splits=5, _regressor_factory=<class 'sklearn.linear_model._base.LinearRegression'>), base_regressor=<class 'sklearn.linear_model._base.LinearRegression'>, history=[])

In [3]:
datasets = {
    'AmpC':'../data/AmpC_screen_table__withscores.csv',
    'D4':'../data/D4_screen_table__withscores.csv',
    '4eiy_1':'../data/4eiy_screen_table__withscores.csv',
    '4eiy_2':'../data/4eiy_screen_table__withscores_2.csv',
    '5zty':'../data/5zty_screen_table__withscores.csv',
    'D4_1M':'/storage/marinegor/github/wiselydock-server/data/D4_1M/D4_screen_table__unique_dropna_213_nonempty_filtered_1M.csv.source'
           }

# dfs = []
# for name, path in datasets.items():
#     df = dd.read_csv(path)
#     df['proj'] = name
#     dfs.append(df)

# raw_data = dd.concat(dfs).compute(num_workers = 4)

In [4]:
def get_scores(name: str, nrows: int = int(1e6)) -> Batch:
    path = datasets.get(name)
    if not path:
        raise ValueError(f'Dataset should be from {list(datasets.keys())}')
    
    df = pd.read_csv(path, nrows=nrows*1.1,).dropna(axis=0).head(nrows)
    df.columns = ['smiles','zincid','dockscore']
    return df

In [5]:
df = get_scores('D4_1M')

In [6]:
df.head()

Unnamed: 0,smiles,zincid,dockscore
0,C=CCc1ccc(OCC(=O)N(CC)CC)c(OC)c1,ZINC000000000007,-24.85
1,O=C(C[S@@](=O)C(c1ccccc1)c1ccccc1)NO,ZINC000000000012,-26.71
2,O[C@@H]1C=C2CCN3Cc4cc5c(cc4[C@@H]([C@H]23)[C@H...,ZINC000000000024,-32.09
3,CC(C)(C)NC[C@@H](O)COc1ccccc1NC(=O)c1ccco1,ZINC000000000039,-54.0
4,CC(=O)c1cc2cccc(OC[C@H](O)CNC(C)C)c2o1,ZINC000000000063,-39.95


In [7]:
def get_scores_for_ids(data: pd.DataFrame, ids: list[str]) -> list[float]:
    ids = ids
    scores = data[data.zincid.isin(ids)].dockscore.values
    return scores

def get_random_scores(data: pd.DataFrame, size: int = 10_000) -> Batch:
    tmp = data.sample(n=size)
    ids, scores = tmp.zincid.values, tmp.dockscore.values
    data = list(zip(ids, scores))
    return data

In [8]:
def timing(f):
    @wraps(f)
    def wrap(*args, **kw):
        ts = time()
        result = f(*args, **kw)
        te = time()
        print ('func:%r took: %2.4f sec' % \
          (f.__name__, te-ts))
        return result
    return wrap

@timing
def init_model(att: bool, r: Iterations) -> ActiveLearningInstance:
    instance_init = ActiveLearningInstance(m, regime=r, add_to_train=att)
    instance_init.dump("instance.json")
    instance = ActiveLearningInstance.from_json("instance.json")
    return instance

@timing
def initialization(instance: ActiveLearningInstance, data: pd.DataFrame):
    ids, scores = zip(*get_random_scores(data))
    data = zip(ids, scores)
    instance.add_data(data)

@timing
def do_cycle(instance: ActiveLearningInstance, i: int, data: pd.DataFrame):
    batch = instance.get_next_batch()
    batch_scores = get_scores_for_ids(data=data, ids=batch)
    data = zip(batch, batch_scores)
    instance.add_data(data)
    instance.dump(f'instance_{i}.json')

# for att in (True, False):
#     for r in Iterations:
#         instance = init_model(att, r)
#         initialization(instance)
#         for i in range(5):
#             do_cycle(instance, i)
#         print('-'*80)

In [39]:
rm *.{json,pkl}

rm: cannot remove '*.pkl': No such file or directory


In [40]:
# Long running cell
import multiprocessing

def learning_call():
    n = 25
    df = get_scores('D4_1M')
    instance = init_model(att=False, r=Iterations.LastModel)
    initialization(instance, data=df)
    for i in tqdm(range(n)):
        do_cycle(instance, i, data=df)
    
multiprocessing.Process(target=learning_call).start()

func:'init_model' took: 0.0056 sec
func:'initialization' took: 18.9163 sec


  0%|          | 0/25 [00:00<?, ?it/s]

func:'do_cycle' took: 87.9371 sec


  4%|▍         | 1/25 [01:27<35:10, 87.94s/it]

func:'do_cycle' took: 120.3059 sec


  8%|▊         | 2/25 [03:28<41:00, 106.98s/it]

func:'do_cycle' took: 145.4856 sec


 12%|█▏        | 3/25 [05:53<45:40, 124.57s/it]

func:'do_cycle' took: 174.6539 sec


 16%|█▌        | 4/25 [08:48<50:31, 144.34s/it]

func:'do_cycle' took: 207.8620 sec


 20%|██        | 5/25 [12:16<55:44, 167.25s/it]

func:'do_cycle' took: 237.5954 sec


 24%|██▍       | 6/25 [16:13<1:00:32, 191.17s/it]

func:'do_cycle' took: 342.2523 sec


 28%|██▊       | 7/25 [21:56<1:12:10, 240.56s/it]

func:'do_cycle' took: 815.5366 sec


 32%|███▏      | 8/25 [35:31<2:00:01, 423.61s/it]

func:'do_cycle' took: 630.1547 sec


 36%|███▌      | 9/25 [46:01<2:10:10, 488.18s/it]

func:'do_cycle' took: 550.2984 sec


 40%|████      | 10/25 [55:12<2:06:50, 507.36s/it]

func:'do_cycle' took: 619.6919 sec


 44%|████▍     | 11/25 [1:05:31<2:06:24, 541.74s/it]

func:'do_cycle' took: 776.7063 sec


 48%|████▊     | 12/25 [1:18:28<2:12:51, 613.22s/it]

func:'do_cycle' took: 716.0547 sec


 52%|█████▏    | 13/25 [1:30:24<2:08:52, 644.38s/it]

func:'do_cycle' took: 784.1215 sec


 56%|█████▌    | 14/25 [1:43:28<2:05:52, 686.59s/it]

func:'do_cycle' took: 900.8236 sec


 60%|██████    | 15/25 [1:58:29<2:05:11, 751.17s/it]

func:'do_cycle' took: 1561.1122 sec


 64%|██████▍   | 16/25 [2:24:30<2:29:14, 994.96s/it]

func:'do_cycle' took: 1269.0342 sec


 68%|██████▊   | 17/25 [2:45:39<2:23:39, 1077.38s/it]

func:'do_cycle' took: 1363.4846 sec


 72%|███████▏  | 18/25 [3:08:23<2:15:43, 1163.35s/it]

func:'do_cycle' took: 1236.9527 sec


 76%|███████▌  | 19/25 [3:29:00<1:58:32, 1185.46s/it]

func:'do_cycle' took: 1506.3500 sec


 80%|████████  | 20/25 [3:54:06<1:46:49, 1281.80s/it]

func:'do_cycle' took: 1616.9362 sec


 84%|████████▍ | 21/25 [4:21:03<1:32:09, 1382.40s/it]

func:'do_cycle' took: 1864.2506 sec


 88%|████████▊ | 22/25 [4:52:07<1:16:21, 1527.02s/it]

In [9]:
def retrieve_top(name: str, share: float = 1e-2) -> list[str]:
    assert 0 < share < 1, share
    scores = get_scores(name)
    size = scores.shape[0]
    top_size = int(size*share)
    rv = scores.sort_values('zincid')[:top_size].zincid.values
    return rv

from more_itertools import flatten

def recall_score_for_instance(inst: instance, top: list[str]) -> float:
    seen = set([pair[0] for pair in flatten(inst.history)])
    known = set(top)
    return len(known & seen) / len(known)

def recall_score_for_many_instances(instances: list[ActiveLearningInstance], top: list[str]) -> float:
    histories = [i.history for i in instances]
    histories = list(flatten(histories))
    seen = set([pair[0] for pair in flatten(histories)])
    known = set(top)
    return len(known & seen) / len(known)

def mean_score_for_instance(inst: instance) -> float:
    scores = set([tuple(pair) for pair in flatten(inst.history)])
    scores = [elem[1] for elem in scores]
    return sum(scores) / len(scores)

def mean_score_for_many_instances(instances: list[ActiveLearningInstance]) -> float:
    histories = [i.history for i in instances]
    histories = list(flatten(histories))
    scores = set([tuple(pair) for pair in flatten(histories)])
    scores = [elem[1] for elem in scores]
    return sum(scores) / len(scores)

top = retrieve_top('D4_1M')

In [10]:
import glob

instances = [ActiveLearningInstance.from_json(path) for path in glob.glob('instance_*.json')]
print(f'{len(instances)=}')
recall_score_for_many_instances(instances, top)

len(instances)=25


0.2475

In [11]:
mean_score_for_many_instances(instances)

-27.987484384614678

In [12]:
ls

Figure_1_dataset_description.ipynb  instance_21.json
Figure_2_performance.ipynb          instance_22.json
Figure_3_iterations.ipynb           instance_23.json
Figure_5_normalization.ipynb        instance_24.json
instance_0.json                     instance_2.json
instance_10.json                    instance_3.json
instance_11.json                    instance_4.json
instance_12.json                    instance_5.json
instance_13.json                    instance_6.json
instance_14.json                    instance_7.json
instance_15.json                    instance_8.json
instance_16.json                    instance_9.json
instance_17.json                    instance.json
instance_18.json                    large_screening_EM.ipynb
instance_19.json                    [0m[01;36mmargarita_notebooks[0m@
instance_1.json                     reproduce_in_package.ipynb
instance_20.json                    Tables_visualisation.ipynb
