In [1]:
import h5py
from tqdm import tqdm
import sqlite3
#import sqlite_utils as su

%load_ext autoreload
%autoreload 2

import os
import sys

# sys.path entries are used verbatim: Python does not expand "~", so the
# literal '~/...' string would never point at the checkout in the home
# directory. Expand it to an absolute path before appending.
sys.path.append(os.path.expanduser('~/msmsgym/MSNovelist-private'))

import fp_management.database as db
import itertools
import uuid
import fp_management.mist_fingerprinting as fpr
import fp_management.fingerprint_map as fpm
import smiles_config as sc
import pickle

from rdkit import Chem
from rdkit.Chem import AllChem

In [2]:
# Single source of truth for the dataset location: every path below is derived
# from it, so pointing the notebook at another copy means editing one line.
DATA_DIR = "/home/stravsmi/msmsgym/msnovelist-data-0531"

db_crossval = f"{DATA_DIR}/msnovelist_crossval.hdf5"
db_train = f"{DATA_DIR}/msnovelist_train.hdf5"

# db_old = "/sirius6_db/combined_0824_v44.db"
# A fresh UUID per execution gives every run a distinct output database path.
db_uuid = uuid.uuid4()
db_new = f"{DATA_DIR}/mist-val-{db_uuid}.db"

# Read-only handles on the HDF5 inputs. They are not closed in this cell
# because later cells read from them.
h5_crossval = h5py.File(db_crossval, mode='r')
h5_train = h5py.File(db_train, mode='r')

# Number of (smiles, fold) records pulled from the input per batch.
PROCESSING_BLOCK_SIZE=40000
# Upper bound on the number of batches the processing loop will run; the
# default is large enough that the loop ends when the input is exhausted.
PROCESSING_BLOCK_MAX_COUNT=9999999999

In [3]:
%autoreload
# Prepare the MIST fingerprinter used by the batch-processing cells below:
# init_instance() is called first, then get_instance() returns the object that
# is kept in the module-level name `fingerprinter`.
# NOTE(review): this looks like a shared singleton instance — confirm in
# fp_management.mist_fingerprinting.
fpr.MistFingerprinter.init_instance()
fingerprinter = fpr.MistFingerprinter.get_instance()

In [4]:
def try_fp_item(smiles_generic, smiles_canonical, fp, fold):
    """Build a ``db.FpItem`` for one dataset record, or return ``None`` on failure.

    The item is created through ``db.FpItem.fromFingerprint`` with
    ``source="dataset"``, ``grp=fold`` and ``b64=False``; afterwards its
    ``fp_degraded`` attribute is set to the same ``fp`` value.

    Args:
        smiles_generic: Passed through as ``smiles_generic``.
        smiles_canonical: Passed through as ``smiles_canonical``.
        fp: Fingerprint of the record; passed as ``fp`` and also assigned to
            ``fp_degraded``.
        fold: Group label of the record, passed as ``grp``.

    Returns:
        The populated item, or ``None`` if building it raised an ``Exception``.

    Raises:
        BaseException: Non-``Exception`` signals such as ``KeyboardInterrupt``
            and ``SystemExit`` propagate, so a long-running batch can still be
            interrupted instead of silently dropping the current record.
    """
    try:
        item = db.FpItem.fromFingerprint(
            smiles_generic = smiles_generic,
            smiles_canonical = smiles_canonical,
            fp = fp,
            source = "dataset",
            grp = fold,
            b64 = False
        )
        item.fp_degraded = fp
    except Exception:
        # Best effort: a record that cannot be converted is skipped by the
        # caller, which filters out None entries.
        return None
    return item

In [5]:

def db_item_block(block):
    """Fingerprint one batch of records and wrap the results as database items.

    Args:
        block: Sequence of ``(smiles, fold)`` pairs. Each ``fold`` must offer
            ``.decode()`` (e.g. ``bytes``).

    Returns:
        List of the items produced by ``try_fp_item``, omitting every record
        for which it returned ``None``. An empty ``block`` yields ``[]``.
    """
    # zip(*[]) produces nothing to unpack, so an empty batch is handled up front.
    if not block:
        return []
    smiles, folds_raw = zip(*block)
    records = fingerprinter.process(smiles, calc_fingerprint=True)
    # NOTE(review): pairing records with folds by position assumes process()
    # returns one result per input, in input order — confirm.
    fingerprints = [rec["fingerprint"] for rec in records]
    folds = [raw.decode() for raw in folds_raw]
    candidates = [
        try_fp_item(rec['smiles_generic'], rec['smiles_canonical'],
                    fingerprint, fold_name)
        for rec, fingerprint, fold_name in zip(records, fingerprints, folds)
    ]
    return [item for item in candidates if item is not None]

In [6]:
# Pair each entry of the cross-validation "smiles" dataset with the entry at
# the same position in the "fold" dataset.
data_in = zip(h5_crossval["smiles"], h5_crossval["fold"])
# One shared iterator: each take() on it continues where the previous call
# stopped, so this cell must be re-run to start again from the first record.
data_iter = iter(data_in)

def take(n, iterable):
    """Return up to the first ``n`` items of ``iterable`` as a list.

    When ``iterable`` is an iterator, the returned items are consumed from it,
    so repeated calls walk through it chunk by chunk. Fewer than ``n`` items
    (possibly none) are returned once the input runs out.
    """
    head = itertools.islice(iterable, n)
    return list(head)

# Report which database path this run uses.
print(f"database: {db_new}")
# Obtain the FpDatabase object for the new path.
# NOTE(review): presumably this creates the database file when it does not
# exist yet — confirm in fp_management.database.
fp_db = db.FpDatabase.load_from_config(db_new)
# Fetch the first batch here; the processing loop tests `block` before its
# first iteration and fetches each following batch itself.
block = take(PROCESSING_BLOCK_SIZE, data_iter)

database: /home/stravsmi/msmsgym/msnovelist-data-0531/mist-val-3641eb4a-7b25-461d-aba9-edf7cbd2b12d.db


In [7]:

# Drain the input batch by batch: fingerprint the current batch, insert the
# resulting items into the database, then fetch the next batch.
processed_blocks = 0
while True:
    # Stop once the input is exhausted or the batch limit has been reached.
    if not block or processed_blocks >= PROCESSING_BLOCK_MAX_COUNT:
        break
    print(f"Processing block {processed_blocks}")
    selected_elements = len(block)
    data_proc = db_item_block(block)
    print(f"Loaded {len(block)} elements, "
          f"selected {selected_elements} elements, "
          f"successfully processed {len(data_proc)} elements")
    fp_db.insert_fp_multiple(data_proc)
    processed_blocks += 1
    block = take(PROCESSING_BLOCK_SIZE, data_iter)
print("Done.")


Processing block 0
Loaded 40000 elements, selected 40000 elements, successfully processed 40000 elements
Processing block 1
Loaded 40000 elements, selected 40000 elements, successfully processed 40000 elements
Processing block 2
Loaded 40000 elements, selected 40000 elements, successfully processed 40000 elements
Processing block 3
Loaded 40000 elements, selected 40000 elements, successfully processed 40000 elements
Processing block 4
Loaded 40000 elements, selected 40000 elements, successfully processed 40000 elements
Processing block 5
Loaded 33446 elements, selected 33446 elements, successfully processed 33446 elements
Done.
