In [1]:
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
import numpy as np
import rdkit
import pandas as pd

import glob
from pathlib import Path

from multiprocessing import Pool

from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import DataStructs

from typing import Callable

import sklearn
from sklearn.linear_model import LinearRegression

from collections import defaultdict

import itertools
from datetime import datetime

# Add fingerprints to the molecules

In [3]:
# Root folder that holds the raw screening table and every derived chunk file.
slowdata = Path('modelling/2022-06-24_Ampc_D4_datasets')
# Gzipped CSV with the screening table (read later with `zincid` as index column).
path = slowdata / 'AmpC_screen_table.csv.gz'

# Number of CSV rows per chunk; an int because it is passed to pandas as a row count.
chunksize = 100_000

In [4]:
def smi2mol(smi: str) -> rdkit.Chem.rdchem.Mol:
    """Parse a SMILES string into an RDKit molecule.

    Thin wrapper around ``Chem.MolFromSmiles``; its result is returned as is.
    NOTE(review): RDKit is expected to return ``None`` for unparsable SMILES,
    which is not handled here -- confirm the input table is clean.
    """
    mol = Chem.MolFromSmiles(smi)
    return mol

def mol2fps(mol: rdkit.Chem.rdchem.Mol, radius: int = 2, n_bits: int = 2048) -> np.ndarray:
    """Compute a Morgan bit-vector fingerprint of ``mol`` as a NumPy array.

    Parameters
    ----------
    mol : rdkit molecule
        Molecule to fingerprint.
    radius : int, default 2
        Morgan radius passed to RDKit.
    n_bits : int, default 2048
        Length of the bit vector (``nBits`` in RDKit).

    Returns
    -------
    np.ndarray
        ``int8`` array filled by ``DataStructs.ConvertToNumpyArray``.
    """
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
    # The destination starts empty; ConvertToNumpyArray is relied on to size
    # and fill it from the bit vector.
    arr = np.zeros((0,), dtype=np.int8)
    DataStructs.ConvertToNumpyArray(fp, arr)
    return arr

def add_fp(chunk: pd.DataFrame) -> pd.DataFrame:
    """Add an ``fp`` column with one fingerprint array per row.

    Each value of the ``smiles`` column is parsed with ``smi2mol`` and
    fingerprinted with ``mol2fps``.

    Note: ``chunk`` is modified in place; the same object is returned for
    convenient chaining.
    """
    # Element-wise apply on the single column we need, instead of the
    # dict-of-functions form of DataFrame.apply.
    chunk['fp'] = chunk['smiles'].apply(lambda smi: mol2fps(smi2mol(smi)))
    return chunk

def write_chunk_to_disk(pair: tuple[int, pd.DataFrame]) -> Path:
    """Write one chunk to ``{slowdata}/AmpC/chunk_<idx>.parq`` and return its path.

    Parameters
    ----------
    pair : (int, DataFrame)
        Chunk number (zero-padded to six digits in the file name) and the
        chunk itself.

    Returns
    -------
    Path
        Location of the parquet file that was written.
    """
    idx, chunk = pair
    fout_name = f'{slowdata}/AmpC/chunk_{idx:06d}.parq'
    # Make sure the target folder exists, so that an expensive fingerprinting
    # pass is not lost on a missing directory. exist_ok keeps this safe when
    # several workers create it at the same time.
    Path(fout_name).parent.mkdir(parents=True, exist_ok=True)
    chunk.to_parquet(fout_name)
    return Path(fout_name)

def add_fp_and_write(pair: tuple[int, pd.DataFrame]) -> Path:
    """Fingerprint one numbered chunk and store it on disk.

    Takes an ``(index, chunk)`` pair, runs ``add_fp`` on the chunk and passes
    the result, under the same index, to ``write_chunk_to_disk``. Returns the
    path reported by the writer.
    """
    number, frame = pair
    return write_chunk_to_disk((number, add_fp(frame)))

def large_csv_to_parquets_with_fps(path: Path, nproc=64) -> list[Path]:
    """Convert a large CSV into per-chunk parquet files with fingerprints.

    The CSV is read in chunks of ``chunksize`` rows (module-level setting)
    with ``zincid`` as the index; every chunk is processed by
    ``add_fp_and_write`` in a pool of ``nproc`` worker processes.

    Parameters
    ----------
    path : Path
        CSV file to read.
    nproc : int, default 64
        Number of worker processes.

    Returns
    -------
    list[Path]
        Paths of the written parquet files, in chunk order.
    """
    with Pool(processes=nproc) as pool:
        reader = pd.read_csv(path, index_col='zincid', chunksize=chunksize)
        # imap pulls chunks from the reader lazily and keeps result order.
        # Pool.map would first turn the whole iterator into a list, i.e. load
        # the entire CSV into the parent process before any work starts.
        path_list = list(pool.imap(add_fp_and_write, enumerate(reader)))
    return path_list

# Save fingerprints to folder in chunks

In [7]:
# %%time

# large_csv_to_parquets_with_fps(path)

CPU times: user 3min 26s, sys: 28.6 s, total: 3min 55s
Wall time: 15min 16s


[PosixPath('modelling/2022-06-24_Ampc_D4_datasets/AmpC/chunk_000000.parq'),
 PosixPath('modelling/2022-06-24_Ampc_D4_datasets/AmpC/chunk_000001.parq'),
 PosixPath('modelling/2022-06-24_Ampc_D4_datasets/AmpC/chunk_000002.parq'),
 PosixPath('modelling/2022-06-24_Ampc_D4_datasets/AmpC/chunk_000003.parq'),
 PosixPath('modelling/2022-06-24_Ampc_D4_datasets/AmpC/chunk_000004.parq'),
 PosixPath('modelling/2022-06-24_Ampc_D4_datasets/AmpC/chunk_000005.parq'),
 PosixPath('modelling/2022-06-24_Ampc_D4_datasets/AmpC/chunk_000006.parq'),
 PosixPath('modelling/2022-06-24_Ampc_D4_datasets/AmpC/chunk_000007.parq'),
 PosixPath('modelling/2022-06-24_Ampc_D4_datasets/AmpC/chunk_000008.parq'),
 PosixPath('modelling/2022-06-24_Ampc_D4_datasets/AmpC/chunk_000009.parq'),
 PosixPath('modelling/2022-06-24_Ampc_D4_datasets/AmpC/chunk_000010.parq'),
 PosixPath('modelling/2022-06-24_Ampc_D4_datasets/AmpC/chunk_000011.parq'),
 PosixPath('modelling/2022-06-24_Ampc_D4_datasets/AmpC/chunk_000012.parq'),
 PosixPath('

# Train simple models

In [5]:
def train_model(chunk: pd.DataFrame) -> sklearn.base.RegressorMixin:
    """Fit a plain ``LinearRegression`` on one chunk.

    The ``fp`` column (one array per row) is stacked into the feature matrix
    and the ``dockscore`` column is used as the target. The fitted model is
    returned.
    """
    features = np.stack(chunk['fp'].values)
    targets = chunk['dockscore'].values
    return LinearRegression().fit(features, targets)

def apply_model_on_chunk(pair: tuple[sklearn.base.RegressorMixin, pd.DataFrame]) -> pd.DataFrame:
    """Predict scores for one chunk with a fitted model.

    ``pair`` is ``(model, chunk)``; the chunk's ``fp`` column is stacked into
    a matrix and fed to ``model.predict``. The result is a one-column frame
    (``dockscore_pred``) sharing the chunk's index.
    """
    fitted, frame = pair
    features = np.stack(frame['fp'].values)
    return pd.DataFrame(
        data=fitted.predict(features),
        index=frame.index,
        columns=['dockscore_pred'],
    )


def apply_coef_on_chunk(triplet: tuple[np.ndarray, float, pd.DataFrame]) -> pd.DataFrame:
    """Apply raw linear coefficients to one in-memory chunk.

    ``triplet`` is ``(coef, intercept, chunk)``. The prediction for each row
    is the sum of ``coef * fingerprint`` plus the intercept. A timestamp is
    printed on entry as a progress marker. Returns a one-column frame
    (``dockscore_pred``) indexed like the chunk.
    """
    stamp = datetime.now().strftime("%H:%M:%S")
    print(f'Current Time = {stamp}')

    weights, bias, frame = triplet
    features = np.stack(frame['fp'].values)
    scores = (weights * features).sum(axis=1) + bias

    return pd.DataFrame(data=scores, index=frame.index, columns=['dockscore_pred'])

def apply_coef_on_parquet(triplet: tuple[np.ndarray, float, Path]) -> pd.DataFrame:
    """Apply raw linear coefficients to a chunk stored as a parquet file.

    ``triplet`` is ``(coef, intercept, path)``. The file is loaded with
    ``pd.read_parquet``, its ``fp`` column is stacked into a matrix, and the
    prediction per row is the sum of ``coef * fingerprint`` plus the
    intercept. A timestamp is printed on entry as a progress marker. Returns
    a one-column frame (``dockscore_pred``) indexed like the loaded chunk.
    """
    stamp = datetime.now().strftime("%H:%M:%S")
    print(f'Current Time = {stamp}')

    weights, bias, source = triplet
    frame = pd.read_parquet(source)

    features = np.stack(frame['fp'].values)
    scores = (weights * features).sum(axis=1) + bias

    return pd.DataFrame(data=scores, index=frame.index, columns=['dockscore_pred'])

# Estimate time for inference

In [6]:
%%time

chunk = pd.read_parquet('modelling/2022-06-24_Ampc_D4_datasets/AmpC/chunk_000000.parq')
model = train_model(chunk)

coef = model.coef_
intercept = model.intercept_

CPU times: user 7min 56s, sys: 6min 25s, total: 14min 22s
Wall time: 24 s


In [86]:
%%time

# Only the original per-chunk parquet files. A bare `*` would also match the
# `.npy` and `_short.parq` files that later cells write into the same folder,
# which `apply_coef_on_parquet` cannot (or should not) read.
chunk_files = sorted(glob.glob(f'{slowdata}/AmpC/chunk_??????.parq'))
print(f'Total number of chunks: {len(chunk_files)}')
# Each task carries the coefficients, the intercept and one file path.
iterable = zip(itertools.repeat(coef), itertools.repeat(intercept), chunk_files)

with Pool(16) as pool:
    df_list = pool.map(apply_coef_on_parquet, iterable)

Total number of chunks: 995
Current Time = 02:03:50Current Time = 02:03:50Current Time = 02:03:50Current Time = 02:03:50Current Time = 02:03:50Current Time = 02:03:50Current Time = 02:03:50
Current Time = 02:03:50Current Time = 02:03:50
Current Time = 02:03:50




Current Time = 02:03:50Current Time = 02:03:50Current Time = 02:03:50Current Time = 02:03:50






Current Time = 02:03:50Current Time = 02:03:50

Current Time = 02:03:53
Current Time = 02:03:53
Current Time = 02:03:54
Current Time = 02:03:54
Current Time = 02:03:54
Current Time = 02:03:54
Current Time = 02:03:54
Current Time = 02:03:54
Current Time = 02:03:54
Current Time = 02:03:54
Current Time = 02:03:54
Current Time = 02:03:54
Current Time = 02:03:54
Current Time = 02:03:55
Current Time = 02:03:55
Current Time = 02:03:55
Current Time = 02:03:57
Current Time = 02:03:57
Current Time = 02:03:57
Current Time = 02:03:58
Current Time = 02:03:58
Current Time = 02:03:58
Current Time = 02:03:58
Current Time = 02:03:58
Current Time

Current Time = 02:06:30
Current Time = 02:06:30
Current Time = 02:06:30
Current Time = 02:06:30
Current Time = 02:06:30
Current Time = 02:06:30
Current Time = 02:06:30
Current Time = 02:06:31
Current Time = 02:06:31
Current Time = 02:06:31
Current Time = 02:06:31
Current Time = 02:06:32
Current Time = 02:06:32
Current Time = 02:06:32
Current Time = 02:06:33
Current Time = 02:06:33
Current Time = 02:06:33
Current Time = 02:06:33
Current Time = 02:06:33
Current Time = 02:06:33
Current Time = 02:06:33
Current Time = 02:06:34
Current Time = 02:06:34
Current Time = 02:06:34
Current Time = 02:06:35
Current Time = 02:06:35
Current Time = 02:06:35
Current Time = 02:06:35
Current Time = 02:06:36
Current Time = 02:06:36
Current Time = 02:06:36
Current Time = 02:06:36
Current Time = 02:06:37
Current Time = 02:06:37
Current Time = 02:06:37
Current Time = 02:06:37Current Time = 02:06:37

Current Time = 02:06:37
Current Time = 02:06:37
Current Time = 02:06:38
Current Time = 02:06:38
Current Time = 0

In [87]:
%%time

rv = pd.concat(df_list)

CPU times: user 1.07 s, sys: 4.02 s, total: 5.09 s
Wall time: 5.04 s


In [88]:
%%time

rv.to_parquet(f'{slowdata}/ampc_pred.parq')

CPU times: user 1min 51s, sys: 10.2 s, total: 2min 1s
Wall time: 2min


In [90]:
1e6 / 1e4 * 4 # minutes for iterations up to 1 mln compounds

400.0

# Try to speed it up

Let's see how much time is spent just reading all the parquet files.

## Save ZINC IDs as integers, and omit smiles

In [7]:
def id2num(s: str) -> int:
    """Convert a ZINC identifier string to its integer part.

    The identifier is assumed to be a 4-character prefix (``ZINC``) followed
    by digits -- TODO confirm against the table. Everything after the prefix
    is parsed, so a non-zero first digit is no longer dropped.

    If the text after the 4-character prefix is not purely numeric (e.g. a
    separator follows the prefix), the previous behaviour of skipping five
    characters is used.

    Raises
    ------
    ValueError
        If the remaining text cannot be parsed as an integer.
    """
    digits = s[4:]
    if not digits.isdecimal():
        # Fall back to the historical offset for ids with a 5-character prefix.
        digits = s[5:]
    return int(digits)

In [47]:
def trim_strings(fin: Path) -> Path:
    """Write a slimmer parquet copy and a boolean ``.npy`` matrix for one chunk.

    Steps:
      1. Load only the ``zincid``, ``dockscore`` and ``fp`` columns of ``fin``.
      2. Replace the index by integers obtained with ``id2num``.
      3. Save the result next to the input as ``<stem>_short<suffix>``.
      4. Stack the ``fp`` column into a matrix, cast it to ``bool`` and save it
         next to the input as ``<stem>.npy``.

    Returns the path of the ``_short`` parquet file.
    """
    src = fin.resolve()
    folder, stem, suffix = src.parent, src.stem, src.suffix

    # Load the chunk and switch to integer ids.
    frame = pd.read_parquet(src, columns=['zincid', 'dockscore', 'fp'])
    frame.index = frame.index.map(id2num)

    # Slim parquet copy.
    short_path = Path(f'{folder}/{stem}_short{suffix}')
    frame.to_parquet(short_path)

    # Fingerprints only, as a boolean matrix (ids are not stored in the .npy).
    fingerprints = np.stack(frame.fp.values).astype(bool)
    np.save(f'{folder}/{stem}.npy', fingerprints)

    return short_path

In [48]:
%%time

trim_strings(Path('modelling/2022-06-24_Ampc_D4_datasets/AmpC/chunk_000000.parq'))

CPU times: user 5.29 s, sys: 2.05 s, total: 7.34 s
Wall time: 7.75 s


PosixPath('/storage/marinegor/modelling/2022-06-24_Ampc_D4_datasets/AmpC/chunk_000000_short.parq')

In [53]:
iterable = [Path(p) for p in sorted(glob.glob(f'{slowdata}/AmpC/chunk_??????.parq'))]

In [57]:
%%time

with Pool(8) as pool:
    _ = pool.map(trim_strings, iterable)

CPU times: user 86.7 ms, sys: 374 ms, total: 461 ms
Wall time: 15min 5s


In [63]:
%%time

x = np.load('modelling/2022-06-24_Ampc_D4_datasets/AmpC/chunk_000002.npy')

CPU times: user 8.33 ms, sys: 99 ms, total: 107 ms
Wall time: 2.79 s


In [66]:
%%time

x = pd.read_parquet('modelling/2022-06-24_Ampc_D4_datasets/AmpC/chunk_000002_short.parq')
X = np.stack(x.fp.values)

CPU times: user 1.98 s, sys: 938 ms, total: 2.92 s
Wall time: 5.41 s


In [209]:
def apply_coef_on_npy(triplet: tuple[np.ndarray, float, Path]) -> tuple[Path, np.ndarray]:
    """Score one ``.npy`` fingerprint matrix and pick its top 1% of rows.

    Parameters
    ----------
    triplet : (coef, intercept, path)
        Linear coefficients, intercept, and the ``.npy`` file to load.

    Returns
    -------
    (Path, np.ndarray)
        The input path and the row positions of the ``k`` largest predicted
        scores, where ``k = n_rows // 100`` (unordered). Empty when ``k == 0``.

    NOTE(review): this version keeps the *largest* scores; the later
    redefinition of this function in the notebook keeps the smallest.
    """
    coef, intercept, path = triplet
    X = np.load(path)
    y_pred = np.sum(coef*X, axis=1) + intercept

    k = y_pred.shape[0]//100
    if k == 0:
        # `[-0:]` would select every row, so handle "nothing to pick" explicitly.
        top_k = np.empty(0, dtype=np.intp)
    else:
        top_k = np.argpartition(y_pred, -k)[-k:]

    rv = (path, top_k)
    return rv


In [93]:
%%time

_ = apply_coef_on_npy(
    (
        coef, intercept, Path('modelling/2022-06-24_Ampc_D4_datasets/AmpC/chunk_000002.npy')
    )
)

Current Time = 12:31:53
CPU times: user 298 ms, sys: 234 ms, total: 532 ms
Wall time: 530 ms


In [105]:
1000*0.5 // 60

8.0

In [106]:
%%time

iterable = [Path(p) for p in sorted(glob.glob(f'{slowdata}/AmpC/chunk_??????.npy'))]
iterable = zip(itertools.repeat(coef), itertools.repeat(intercept), iterable)

with Pool(4) as pool:
    rv = pool.map(apply_coef_on_npy, iterable)

Current Time = 13:02:55Current Time = 13:02:55Current Time = 13:02:55Current Time = 13:02:55



Current Time = 13:02:55
Current Time = 13:02:55
Current Time = 13:02:55
Current Time = 13:02:55
Current Time = 13:02:55
Current Time = 13:02:56
Current Time = 13:02:56
Current Time = 13:02:56
Current Time = 13:02:56
Current Time = 13:02:56
Current Time = 13:02:56
Current Time = 13:02:56
Current Time = 13:02:56
Current Time = 13:02:57
Current Time = 13:02:57
Current Time = 13:02:57
Current Time = 13:02:57
Current Time = 13:02:57
Current Time = 13:02:58
Current Time = 13:02:58
Current Time = 13:02:58
Current Time = 13:02:58
Current Time = 13:02:58
Current Time = 13:02:58
Current Time = 13:02:58
Current Time = 13:02:58
Current Time = 13:02:59
Current Time = 13:02:59
Current Time = 13:02:59
Current Time = 13:02:59
Current Time = 13:02:59
Current Time = 13:02:59
Current Time = 13:02:59
Current Time = 13:03:00
Current Time = 13:03:00
Current Time = 13:03:00
Current Time = 13:03:00
Current Time = 1

Current Time = 13:04:26
Current Time = 13:04:26
Current Time = 13:04:26
Current Time = 13:04:26
Current Time = 13:04:26
Current Time = 13:04:27Current Time = 13:04:27

Current Time = 13:04:27
Current Time = 13:04:27
Current Time = 13:04:27
Current Time = 13:04:27
Current Time = 13:04:27
Current Time = 13:04:27
Current Time = 13:04:27
Current Time = 13:04:28
Current Time = 13:04:28
Current Time = 13:04:28
Current Time = 13:04:28
Current Time = 13:04:28
Current Time = 13:04:28
Current Time = 13:04:28Current Time = 13:04:28

Current Time = 13:04:29
Current Time = 13:04:29
Current Time = 13:04:29
Current Time = 13:04:29
Current Time = 13:04:29
Current Time = 13:04:29
Current Time = 13:04:30
Current Time = 13:04:30
Current Time = 13:04:30
Current Time = 13:04:30
Current Time = 13:04:30
Current Time = 13:04:30
Current Time = 13:04:30
Current Time = 13:04:30
Current Time = 13:04:31
Current Time = 13:04:31
Current Time = 13:04:31
Current Time = 13:04:31
Current Time = 13:04:31
Current Time = 1

# Imitate the large screening

In [5]:
ls modelling/2022-06-24_Ampc_D4_datasets/AmpC

chunk_000000.npy         chunk_000331_short.parq  chunk_000663.parq
chunk_000000.parq        chunk_000332.npy         chunk_000663_short.parq
chunk_000000_short.parq  chunk_000332.parq        chunk_000664.npy
chunk_000001.npy         chunk_000332_short.parq  chunk_000664.parq
chunk_000001.parq        chunk_000333.npy         chunk_000664_short.parq
chunk_000001_short.parq  chunk_000333.parq        chunk_000665.npy
chunk_000002.npy         chunk_000333_short.parq  chunk_000665.parq
chunk_000002.parq        chunk_000334.npy         chunk_000665_short.parq
chunk_000002_short.parq  chunk_000334.parq        chunk_000666.npy
chunk_000003.npy         chunk_000334_short.parq  chunk_000666.parq
chunk_000003.parq        chunk_000335.npy         chunk_000666_short.parq
chunk_000003_short.parq  chunk_000335.parq        chunk_000667.npy
chunk_000004.npy         chunk_000335_short.parq  chunk_000667.parq
chunk_000004.parq        chunk_000336.npy         chunk_000667_short.parq
chunk_00

What we have so far:

 - `train_model` -- trains model on a pandas dataframe with `dockscore` and `fp` columns
 - `apply_coef_on_npy` -- applies linear coefficients on a single npy file

Pseudocode of what we want to do:

```python

model = initialize_random_model()

for n in range(num_iterations):
    chunk = select_chunk_based_on_model(model)
    model = train_model(chunk)
    do_logging(chunk, model)
```

In [6]:
from tqdm.notebook import tqdm

In [7]:
from sklearn.model_selection import KFold
from sklearn.ensemble import StackingRegressor, VotingRegressor

In [12]:
from functools import wraps
from time import time

def timing(f):
    """Decorator that reports the wall-clock duration of every call to ``f``.

    After each call a line ``func: <name>, took: <seconds>s`` is printed and
    the wrapped function's result is returned unchanged. Nothing is printed if
    ``f`` raises.
    """
    @wraps(f)
    def timed(*args, **kw):
        started = time()
        result = f(*args, **kw)
        elapsed = time() - started
        print(f'func: {f.__name__}, took: {elapsed:2.4f}s')
        return result
    return timed

class LinearPredictionModel(LinearRegression):
    """
    Prediction-only linear model with fixed, user-supplied parameters.

    The model is initialised with known coefficients and intercept and can be
    used for ``predict`` right away. Calling ``fit`` raises
    ``NotImplementedError``.

    Parameters
    ----------
    coef : np.ndarray
        See ``coef_`` below.
    intercept : float
        See ``intercept_`` below. Any real number (Python or NumPy scalar)
        is accepted.

    Attributes
    ----------
    coef_ : array of shape (n_features, ) or (n_targets, n_features)
        Coefficients of the linear model.  If there are multiple targets
        (y 2D), this is a 2D array of shape (n_targets, n_features),
        whereas if there is only one target, this is a 1D array of
        length n_features.
    intercept_ : float
        Independent term in the linear model.

    A modification of: https://stackoverflow.com/questions/61491678/how-to-instantiate-a-scikit-learn-linear-model-with-known-coefficients-without-f
    """

    def __init__(self, coef: np.ndarray, intercept: float):
        assert isinstance(coef, np.ndarray)
        assert isinstance(intercept, (int, float, np.integer, np.floating))
        # Give the parent's own hyper-parameters their defaults; parts of
        # scikit-learn read them even on the predict path (version dependent).
        super().__init__()
        # scikit-learn's get_params()/clone()/repr look up every __init__
        # argument as an attribute of the same name, stored unmodified.
        self.coef = coef
        self.intercept = intercept
        # Fitted-state attributes used by predict().
        self.coef_ = coef
        self.intercept_ = float(intercept)

    def fit(self, X, y):
        """Always raises: this model is for prediction only."""
        raise NotImplementedError("model is only for prediction")

@timing
def initialize_random_model(shape=(2048,), seed=None) -> LinearRegression:
    """Create a prediction-only linear model with random parameters.

    Parameters
    ----------
    shape : tuple, default (2048,)
        Shape of the coefficient array (drawn from a standard normal).
    seed : int, optional
        If given, a dedicated ``np.random.default_rng(seed)`` is used so the
        model is reproducible. If ``None`` (default) the global NumPy random
        state is used, exactly as before.

    Returns
    -------
    LinearPredictionModel
        Model whose intercept is a single normal draw centred at 1.
        NOTE(review): the mean of 1 comes from ``normal(1)``; confirm it is
        intended rather than a mistyped ``size=1``.
    """
    if seed is None:
        coef = np.random.normal(size=shape)
        intercept = np.random.normal(1)
    else:
        rng = np.random.default_rng(seed)
        coef = rng.normal(size=shape)
        intercept = rng.normal(1)
    model = LinearPredictionModel(coef=coef, intercept=float(intercept))
    return model

from typing import Optional, Iterable

@timing
def train_model(chunk: pd.DataFrame, n_folds: Optional[int] = 5) -> sklearn.base.RegressorMixin:
    """Fit a linear model (wrapped in a ``VotingRegressor``) on one chunk.

    Parameters
    ----------
    chunk : DataFrame
        Must provide an ``fp`` column (one array per row) and a ``dockscore``
        column.
    n_folds : int, optional, default 5
        Number of ``LinearRegression`` members in the returned
        ``VotingRegressor``. With ``None`` or a value below 2 a single
        ``LinearRegression`` is fitted and returned instead.

    Returns
    -------
    Fitted regressor exposing ``predict``.

    Notes
    -----
    ``VotingRegressor.fit`` refits a clone of every member on the full
    ``X, y``, so fitting the members on K-fold splits beforehand had no effect
    on the returned model; those discarded fits were removed.
    NOTE(review): all members are therefore trained on identical data. If a
    real fold ensemble is wanted, the per-fold models have to be combined
    without calling ``VotingRegressor.fit`` (e.g. by averaging coefficients).
    """
    # split by X and y
    X = np.stack(chunk.fp.values)
    y = chunk.dockscore.values

    if n_folds is None or n_folds < 2:
        model = LinearRegression()
        model.fit(X, y)
        return model

    # Same member names ('0', '1', ...) as before.
    models = [(str(i), LinearRegression()) for i in range(n_folds)]

    final_model = VotingRegressor(models)
    final_model.fit(X, y)
    return final_model


def apply_coef_on_npy(triplet: tuple[np.ndarray, float, Path]) -> tuple[Path, np.ndarray, np.ndarray]:
    """Score one ``.npy`` fingerprint matrix and keep its lowest-scoring 2%.

    Parameters
    ----------
    triplet : (coef, intercept, path)
        Linear coefficients, intercept, and the ``.npy`` file to load.

    Returns
    -------
    (Path, np.ndarray, np.ndarray)
        The input path, the row positions of the ``k = n_rows // 50`` smallest
        predicted scores (unordered), and those scores. Both arrays are empty
        when ``k == 0``, including for a file with no rows.
    """
    coef, intercept, path = triplet
    X = np.load(path)
    y_pred = np.sum(coef*X, axis=1) + intercept

    k = y_pred.shape[0]//50
    if k == 0:
        # Nothing to select; also avoids argpartition failing on an empty array.
        return (path, np.empty(0, dtype=np.intp), y_pred[:0])

    top_k = np.argpartition(y_pred, k)[:k]
    top_k_scores = y_pred[top_k]

    rv = (path, top_k, top_k_scores)
    return rv

def apply_model_on_npy(pair: tuple[sklearn.base.RegressorMixin, Path]) -> tuple[Path, np.ndarray, np.ndarray]:
    """Score one ``.npy`` fingerprint matrix with a model and keep its lowest 1%.

    Parameters
    ----------
    pair : (model, path)
        Any object with a ``predict`` method, and the ``.npy`` file to load.

    Returns
    -------
    (Path, np.ndarray, np.ndarray)
        The input path, the row positions of the ``k = n_rows // 100``
        smallest predictions (unordered), and those predictions. Both arrays
        are empty when the file has no rows or ``k == 0``.
    """
    model, path = pair
    X = np.load(path)
    if X.shape[0] == 0:
        # An empty chunk has nothing to rank; skip predict/argpartition,
        # which would fail on it.
        return (path, np.empty(0, dtype=np.intp), np.empty(0, dtype=float))

    y_pred = model.predict(X)

    k = y_pred.shape[0]//100
    top_k = np.argpartition(y_pred, k)[:k]
    top_k_scores = y_pred[top_k]

    rv = (path, top_k, top_k_scores)
    return rv


@timing
def select_chunk_based_on_model(model: LinearRegression, 
                                iterable: Optional[Iterable]=None) -> list[tuple[Path, np.ndarray, np.ndarray]]:
    """Run ``apply_model_on_npy`` over many ``.npy`` chunks in parallel.

    ``iterable`` is the collection of ``.npy`` paths to score (it must support
    ``len``). When omitted, all ``chunk_??????.npy`` files under
    ``{slowdata}/AmpC`` are used in sorted order. Work is spread over a pool
    of 32 processes with a progress bar; results come back in input order.
    """
    npy_paths = iterable
    if npy_paths is None:
        npy_paths = list(map(Path, sorted(glob.glob(f'{slowdata}/AmpC/chunk_??????.npy'))))
    n_tasks = len(npy_paths)
    tasks = zip(itertools.repeat(model), npy_paths)

    with Pool(32) as pool:
        results = pool.imap(apply_model_on_npy, tasks)
        collected = list(tqdm(results, total=n_tasks, desc='applying model'))

    return collected


@timing
def select_chunk_based_on_coef(model: LinearRegression, 
                                iterable: Optional[Iterable]=None) -> list[tuple[Path, np.ndarray, np.ndarray]]:
    """Run ``apply_coef_on_npy`` over many ``.npy`` chunks in parallel.

    Only ``model.coef_`` and ``model.intercept_`` are sent to the workers.
    ``iterable`` is the collection of ``.npy`` paths to score (it must support
    ``len``). When omitted, all ``chunk_??????.npy`` files under
    ``{slowdata}/AmpC`` are used in sorted order. Work is spread over a pool
    of 32 processes with a progress bar; results come back in input order.
    """
    weights, bias = model.coef_, model.intercept_

    npy_paths = iterable
    if npy_paths is None:
        npy_paths = list(map(Path, sorted(glob.glob(f'{slowdata}/AmpC/chunk_??????.npy'))))
    n_tasks = len(npy_paths)
    tasks = zip(itertools.repeat(weights), itertools.repeat(bias), npy_paths)

    with Pool(32) as pool:
        results = pool.imap(apply_coef_on_npy, tasks)
        collected = list(tqdm(results, total=n_tasks, desc='applying model'))

    return collected

@timing
def merge_tops_from_chunks(
    triplets: list[tuple[Path, np.ndarray, np.ndarray]],
    top_N: int = 10_000
                           ) -> dict[Path, list]:
    """Merge per-chunk candidates and keep the ``top_N`` lowest scores overall.

    Parameters
    ----------
    triplets : list of (path, row_positions, scores)
        Per-chunk results; ``row_positions[i]`` belongs to ``scores[i]``.
    top_N : int, default 10_000
        Number of candidates to keep across all chunks. If fewer candidates
        exist, all of them are kept.

    Returns
    -------
    dict[Path, list]
        Maps each chunk path with at least one selected candidate to the list
        of selected row positions in that chunk. Empty when there is nothing
        to select.
    """
    d = defaultdict(list)
    if not triplets or top_N <= 0:
        return d

    # owner[i]: position in `triplets` of the chunk that candidate i came from.
    owner = np.repeat(np.arange(len(triplets)), [len(elem[1]) for elem in triplets])
    indices = np.hstack([elem[1] for elem in triplets])
    scores = np.hstack([elem[2] for elem in triplets])

    n_candidates = scores.shape[0]
    if n_candidates <= top_N:
        # Everything fits: keep all candidates instead of dropping one to
        # satisfy argpartition's bounds.
        selected = np.arange(n_candidates)
    else:
        # Positions of the top_N smallest scores (unordered).
        selected = np.argpartition(scores, top_N)[:top_N]

    for pos in selected:
        d[triplets[owner[pos]][0]].append(indices[pos])

    return d
    

def name_and_idx_to_dataframe(pair: tuple[Path, np.ndarray]) -> pd.DataFrame:
    """Load the selected rows of the parquet file that matches a chunk path.

    ``pair`` is ``(name, idx)``: the parquet file is the one in the same
    folder and with the same stem as ``name`` but with a ``.parq`` extension;
    ``idx`` holds positional row numbers, applied with ``iloc``.
    """
    chunk_path, positions = pair
    folder = chunk_path.parent
    stem = chunk_path.stem
    table = pd.read_parquet(f"{folder}/{stem}.parq")
    return table.iloc[positions]
    

@timing
def imitate_docking_of_chunks(chunks: dict[Path, np.ndarray]) -> pd.DataFrame:
    """Collect the stored rows for the selected compounds ("docking" by lookup).

    For every ``(path, row_positions)`` item of ``chunks`` the matching rows
    are loaded by ``name_and_idx_to_dataframe`` in a pool of 8 processes (with
    a progress bar) and concatenated. Only rows whose ``dockscore`` value is a
    ``float`` instance are kept -- presumably to drop non-numeric
    placeholders; verify against the source table.
    """
    tasks = chunks.items()
    n_tasks = len(tasks)

    with Pool(8) as pool:
        loaded = pool.imap(name_and_idx_to_dataframe, tasks)
        frames = list(tqdm(loaded, total=n_tasks, desc='"docking" top N'))

    docked = pd.concat(frames)
    has_float_score = docked.dockscore.apply(lambda x: isinstance(x, float))
    return docked[has_float_score]


## Single iteration (for testing)

In [9]:
short_iterable = [Path(p) for p in sorted(glob.glob(f'{slowdata}/AmpC_small/chunk_??????.npy'))]
model = initialize_random_model()

rv = select_chunk_based_on_model(model=model, iterable=short_iterable)
chunks = merge_tops_from_chunks(rv)
df = imitate_docking_of_chunks(chunks)

func: initialize_random_model, took: 0.0127s


applying model:   0%|          | 0/10 [00:00<?, ?it/s]

func: select_chunk_based_on_model, took: 8.8803s
func: merge_tops_from_chunks, took: 0.0113s


"docking" top N:   0%|          | 0/10 [00:00<?, ?it/s]

func: imitate_docking_of_chunks, took: 7.0117s


In [10]:
train_model(df)

func: train_model, took: 37.2447s


In [11]:
# `_` is IPython's "last displayed output"; here it is expected to be the
# model returned by the preceding `train_model(df)` cell.
# NOTE(review): this only holds if no other cell produced output in between,
# and it breaks under Restart & Run All if the cell order changes. Assigning
# the result directly (`m = train_model(df)`) in the previous cell is safer.
m = _

# Main iteration loop

In [62]:
96214207 // 100 // 10_000

96

In [None]:
# NOTE(review): the previous cell computes 96 for 10_000 compounds per
# iteration, while this loop uses top_N=100_000 and 960 iterations -- confirm
# the intended iteration count.
num_iterations = 960
model = initialize_random_model()

for n in tqdm(range(num_iterations)):
    # 1) score every chunk with the current model, 2) keep the best candidates
    # overall, 3) look up their stored docking scores.
    rv = select_chunk_based_on_model(model=model)
    chunks = merge_tops_from_chunks(rv, top_N=100_000)
    df = imitate_docking_of_chunks(chunks)
    # Save this iteration's rows before the training step, so a failure while
    # training does not lose them.
    df.to_parquet(f'{slowdata}/iteration_{n}.parq')
    model = train_model(df)

func: initialize_random_model, took: 0.0016s


  0%|          | 0/960 [00:00<?, ?it/s]

applying model:   0%|          | 0/995 [00:00<?, ?it/s]

func: select_chunk_based_on_model, took: 206.3026s
func: merge_tops_from_chunks, took: 0.8277s


"docking" top N:   0%|          | 0/995 [00:00<?, ?it/s]

func: imitate_docking_of_chunks, took: 318.1694s
func: train_model, took: 141.4745s


applying model:   0%|          | 0/995 [00:00<?, ?it/s]

func: select_chunk_based_on_model, took: 480.1135s
func: merge_tops_from_chunks, took: 1.4480s


"docking" top N:   0%|          | 0/995 [00:00<?, ?it/s]

func: imitate_docking_of_chunks, took: 317.9292s
func: train_model, took: 156.0878s


applying model:   0%|          | 0/995 [00:00<?, ?it/s]

func: select_chunk_based_on_model, took: 498.8573s
func: merge_tops_from_chunks, took: 0.8040s


"docking" top N:   0%|          | 0/994 [00:00<?, ?it/s]

func: imitate_docking_of_chunks, took: 332.8860s
func: train_model, took: 148.2194s


applying model:   0%|          | 0/995 [00:00<?, ?it/s]

func: select_chunk_based_on_model, took: 561.1616s
func: merge_tops_from_chunks, took: 0.7899s


"docking" top N:   0%|          | 0/995 [00:00<?, ?it/s]

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

