In [8]:
import pandas as pd
import numpy as np

# import sklearn as sk
import os

import rdkit.Chem.PandasTools as pt
from rdkit import RDLogger

RDLogger.DisableLog("rdApp.*")

import multiprocessing as mp
from multiprocessing import Process, Pool
from functools import wraps
import datetime as dt

In [25]:
def loggg(f):
    def wrapper(dataf, *args, **kwargs):
        tic = dt.datetime.now()
        result = f(dataf, *args, **kwargs)
        toc = dt.datetime.now()

        if hasattr(dataf, "shape") and hasattr(result, "shape"):
            share_before = dataf.shape
            shape_after = result.shape
            added_columns = set(result.columns) - set(dataf.columns)
            print(
                f"{f.__name__},  shape {dataf.shape}->{result.shape},  took={toc-tic}"
            )

        else:
            print(f"{f.__name__} took={toc-tic}")

        return result

    return wrapper

In [26]:
@loggg
def load_data(inputpath):
    import os
    import rdkit.Chem.PandasTools as pt

    from rdkit import RDLogger

    RDLogger.DisableLog("rdApp.*")

    dataframes = []

    for (
        path,
        folders,
        files,
    ) in os.walk(inputpath):
        for fle in files:
            if os.path.splitext(fle)[1] == ".sdf":
                df = pt.LoadSDF(f"{path}/{fle}")
                dataframes.append(df)

    return pd.concat(dataframes)


@loggg
def start_pipeline(dataf):
    return dataf.copy()


@loggg
def convert_dtypes(dataf):
    for column in dataf.columns:
        try:
            dataf[column] = dataf[column].astype("float64")
        except Exception as e:
            print(f"\t{column} convertion yielded {e}")
    return dataf


@loggg
def extract_project_name(dataf):
    def _short_project_name(prpath):
        return prpath.rsplit("/", 2)[-2]

    dataf["project_name"] = dataf.transform({"projPath": _short_project_name}, axis=0)
    return dataf

## Fingerprints

#### generation:
rdkit2fps --morgan file_with_smiles.smi -o file_with_fingerprints.fps

#### header
#FPS1
#num_bits=2048
#type=RDKit-Morgan/1 radius=2 fpSize=2048 useFeatures=0 useChirality=0 useBondTypes=1 
#software=RDKit/2018.09.1 chemfp/1.6.1
#source=ZINC20_1_truncated.smi
#date=2021-02-10T16:57:36

Here is the list of paths depending on which fingerprints we want to use:

In [11]:
fps_filename_morgan = "/storage/margaritakovaleva/accelerated_docking/5zty/fingerprints/ZINC20_1_binary.fps"
# fps_filename_morgan_radius3 = '/storage/score_prediction_KM/2021-01-25_fingerprints'
# fps_filename_morgan_radius2_size4096 = '/storage/score_prediction_KM/2021-01-25_fingerprints'
# fps_filename_AP = '/storage/score_prediction_KM/2021-01-25_fingerprints/AP/ZINC20_1.AP_fps'
# docking_results_folder = '/home/margaritakovaleva/score_prediction/2021-04-19_second_docking'
docking_results_folder = (
    "/storage/margaritakovaleva/accelerated_docking/2023-01-18_second_docking_5zty"
)

Specify which ones to choose now:

In [13]:
current_fps_filename = fps_filename_morgan


In [None]:
# def filename_generation(params):

#     general_path = '/storage/score_prediction_KM/2021-01-25_fingerprints/' + str(params[0]) + '/'

#     if params[0] == 'AP':
#         return general_path + 'ZINC20_1.AP_fps'

#     elif params[0] == 'Morgan':
#         if
#     else:
#         print("Wrong parameters")

In [27]:
def load_fingerprints(file_with_fingerprints=current_fps_filename):
    with open(file_with_fingerprints) as fin:

        fps = [elem.split() for elem in fin]

        fps = {
            "fps_hex": [elem[0] for elem in fps],
            "ZincID": [elem[1] for elem in fps],
        }

        fps = pd.DataFrame(fps)
    return fps

In [28]:
def load_fingerprints_with_header(file_with_fingerprints=current_fps_filename):

    with open(file_with_fingerprints) as fin:
        fp = []
        header = []

        for elem in fin:
            if elem[0] != "#":
                fp.append(elem.split())
            else:
                header.append(elem)

        if len(header) > 0:
            # type=RDKit-Morgan/1 radius=3 fpSize=2048 useFeatures=0 useChirality=0 useBondTypes=1
            size = int(header[1].split("=")[1])
            header = header[2].split()
            fps_type = header[0].split("=")[1]

            if fps_type == "RDKit-Morgan/1":
                fps_type = "Morgan"
                radius = header[1].split("=")[1]
                name = f"fps_hex__type={fps_type}__size={size}__radius={radius}"
            elif fps_type == "RDKit-AtomPair/1":
                fps_type = "AtomPair"
                name = f"fps_bin__type={fps_type}__size={size}"
            else:
                print(f"Unknown type of fingerprints:{fps_type}")
                name = f"fps__type={fps_type}__size={size}"

        else:
            print("No header, assume that it is Morgan with radius 2")
            name = "fps_hex__type=Morgan__size=2048__radius=2"
            fps_type = "Morgan"
            size = 2048

        fp = {
            name: [elem[0] for elem in fp],
            "ZincID": [elem[1] for elem in fp],
        }

        fp = pd.DataFrame(fp)
    return fp, size, fps_type, name

In [29]:
# Choose fingerprints: morgan or atom_pairs
fps, fps_size, fps_type, fps_name = load_fingerprints_with_header()

No header, assume that it is Morgan with radius 2


In [40]:
fps, fps_size, fps_type, fps_name


(                fps_hex__type=Morgan__size=2048__radius=2            ZincID
 0       0000000000000000000000000000000000000000000000...  ZINC000044609991
 1       0000000000000000000000000000000000000000000000...  ZINC001565520364
 2       0000000000000000000000000000000000000000000000...  ZINC000001679355
 3       0000000000000000000000000000000000000000000000...  ZINC000003014560
 4       0000000000000000000000000000000000000000000000...  ZINC000028769330
 ...                                                   ...               ...
 999995  0000000000000000000000000000000001000000000000...  ZINC001807465293
 999996  0000000000000000000000000000000001000000000000...  ZINC001811271264
 999997  0000000000000000000000000000000000000000000000...  ZINC001824936928
 999998  0000000000000000000000001010000000000000000000...  ZINC001831695841
 999999  0000000000000000000000000010000000000000000000...  ZINC001845398527
 
 [1000000 rows x 2 columns],
 2048,
 'Morgan',
 'fps_hex__type=Morgan__siz

## Writing to csv

In [31]:
def hex2bin(s, size):
    s_upd = bin(int(s, 16))[2:]
    s_upd = str.zfill(s_upd, fps_size)
    return np.array([int(sym) for sym in s_upd])

In [32]:
def array_from_string(string):
    return np.array([int(sym) for sym in string])


In [46]:
test_bins = fps["fps_hex__type=Morgan__size=2048__radius=2"][:10]

In [54]:
bins = np.array([array_from_string(s) for s in test_bins])
pd.DataFrame(data=bins, columns=[f"fps_{i}" for i in range(fps_size)])

AttributeError: module 'pandas.io.formats.format' has no attribute '_get_adjustment'

   fps_0  fps_1  fps_2  fps_3  fps_4  fps_5  fps_6  fps_7  fps_8  fps_9  ...  \
0      0      0      0      0      0      0      0      0      0      0  ...   
1      0      0      0      0      0      0      0      0      0      0  ...   
2      0      0      0      0      0      0      0      0      0      0  ...   
3      0      0      0      0      0      0      0      0      0      0  ...   
4      0      0      0      0      0      0      0      0      0      0  ...   
5      0      0      0      0      0      0      0      0      0      0  ...   
6      0      0      0      0      0      0      0      0      0      0  ...   
7      0      0      0      0      0      0      1      0      0      0  ...   
8      0      0      0      0      0      0      0      0      0      0  ...   
9      0      0      0      0      0      0      1      0      0      0  ...   

   fps_2038  fps_2039  fps_2040  fps_2041  fps_2042  fps_2043  fps_2044  \
0         0         0         0         0   

In [55]:
def process_single_sdf_file(
    sdf_filename,
    name=fps_name,
    size=fps_size,
    fps_frame=fps,
    ftype=fps_type,
    hexed=False,
    binarised=True,
):

    docking_res = pt.LoadSDF(sdf_filename)
    docking_res = docking_res.rename(
        {
            "A": "Smiles",
            "B": "ZincID",
        },
        axis=1,
    )

    good_columns = ["ZincID", "Score", "Smiles"]
    docking_res = docking_res[good_columns]
    rv = pd.merge(docking_res, fps_frame, on="ZincID")

    if ftype == "Morgan":
        radius = int(name.split("__")[3].split("=")[1])
        if hexed == True:
            out_filename = sdf_filename.replace(
                ".sdf", f"__radius={radius}__size={size}.csv"
            )
            rv.to_csv(out_filename)
            hexed_length = rv.shape[0]
            print(sdf_filename, "-->", out_filename)

        if binarised == True:

            hexes = rv[name]
            #             bins = np.array([hex2bin(s, fps_size) for s in hexes])
            bins = np.array([array_from_string(s) for s in hexes])
            print(f"Bins created, shape {bins.shape} ")

            columns = [f"fps_{i}" for i in range(size)]
            #             new_name = name.replace('fps_hex', 'fps_bin')

            bins = pd.DataFrame(data=bins, columns=columns)
            rv = rv.drop([name], axis=1)
            rv = pd.concat([rv, bins], axis=1)

            out_filename = sdf_filename.replace(
                ".sdf", f"__radius={radius}__size={size}__binarised.csv"
            )
            rv.to_csv(out_filename)
            binarised_length = rv.shape[0]
            print(sdf_filename, "-->", out_filename)

        if (hexed == True) and (binarised == True):
            assert hexed_length == binarised_length, print("Problems")

    #     elif ftype == 'AtomPair':

    #         fps_strings = rv[name]
    #         bins = np.array([array_from_string(string) for string in fps_strings], np.array)

    #         columns=[]
    #         for i in range(size):
    #             columns.append(name + '_' + str(i))

    #         bins = pd.DataFrame(data=bins, columns = columns)

    #         rv = rv.drop([name], axis = 1)
    #         rv = pd.concat([rv, bins], axis = 1)

    #         out_filename = sdf_filename.replace('.sdf',f'_binarised_AP__size={size}.csv')
    #         rv.to_csv(out_filename)

    #         print(sdf_filename, '-->', out_filename)

    return 0

In [57]:
%%time

from rdkit import RDLogger
RDLogger.DisableLog('rdApp.*')

list_of_paths = []

for path, folders, files, in os.walk(docking_results_folder):
    for file in files:
        if os.path.splitext(file)[1] == '.sdf':
            list_of_paths.append(f"{path}/{file}")

with Pool(6) as p:
    answer = p.map(process_single_sdf_file, list_of_paths)

Bins created, shape (9888, 2048) 
Bins created, shape (9776, 2048) 
Bins created, shape (9822, 2048) 
Bins created, shape (9927, 2048) 
Bins created, shape (9970, 2048) 
Bins created, shape (9930, 2048) 
/storage/margaritakovaleva/accelerated_docking/2023-01-18_second_docking_5zty/out/5zty_1/ZINC20_1_full_6.sdf --> /storage/margaritakovaleva/accelerated_docking/2023-01-18_second_docking_5zty/out/5zty_1/ZINC20_1_full_6__radius=2__size=2048__binarised.csv
/storage/margaritakovaleva/accelerated_docking/2023-01-18_second_docking_5zty/out/5zty_1/ZINC20_1_full_24.sdf --> /storage/margaritakovaleva/accelerated_docking/2023-01-18_second_docking_5zty/out/5zty_1/ZINC20_1_full_24__radius=2__size=2048__binarised.csv
/storage/margaritakovaleva/accelerated_docking/2023-01-18_second_docking_5zty/out/5zty_1/ZINC20_1_full_75.sdf --> /storage/margaritakovaleva/accelerated_docking/2023-01-18_second_docking_5zty/out/5zty_1/ZINC20_1_full_75__radius=2__size=2048__binarised.csv
/storage/margaritakovaleva/acc

/storage/margaritakovaleva/accelerated_docking/2023-01-18_second_docking_5zty/out/5zty_1/ZINC20_1_full_2.sdf --> /storage/margaritakovaleva/accelerated_docking/2023-01-18_second_docking_5zty/out/5zty_1/ZINC20_1_full_2__radius=2__size=2048__binarised.csv
Bins created, shape (9896, 2048) 
/storage/margaritakovaleva/accelerated_docking/2023-01-18_second_docking_5zty/out/5zty_1/ZINC20_1_full_51.sdf --> /storage/margaritakovaleva/accelerated_docking/2023-01-18_second_docking_5zty/out/5zty_1/ZINC20_1_full_51__radius=2__size=2048__binarised.csv
Bins created, shape (9918, 2048) 
/storage/margaritakovaleva/accelerated_docking/2023-01-18_second_docking_5zty/out/5zty_1/ZINC20_1_full_55.sdf --> /storage/margaritakovaleva/accelerated_docking/2023-01-18_second_docking_5zty/out/5zty_1/ZINC20_1_full_55__radius=2__size=2048__binarised.csv
Bins created, shape (9887, 2048) 
Bins created, shape (9733, 2048) 
/storage/margaritakovaleva/accelerated_docking/2023-01-18_second_docking_5zty/out/5zty_1/ZINC20_1_

/storage/margaritakovaleva/accelerated_docking/2023-01-18_second_docking_5zty/out/5zty_1/ZINC20_1_full_49.sdf --> /storage/margaritakovaleva/accelerated_docking/2023-01-18_second_docking_5zty/out/5zty_1/ZINC20_1_full_49__radius=2__size=2048__binarised.csv
/storage/margaritakovaleva/accelerated_docking/2023-01-18_second_docking_5zty/out/5zty_1/ZINC20_1_full_31.sdf --> /storage/margaritakovaleva/accelerated_docking/2023-01-18_second_docking_5zty/out/5zty_1/ZINC20_1_full_31__radius=2__size=2048__binarised.csv
Bins created, shape (9923, 2048) 
Bins created, shape (9914, 2048) 
/storage/margaritakovaleva/accelerated_docking/2023-01-18_second_docking_5zty/out/5zty_1/ZINC20_1_full_42.sdf --> /storage/margaritakovaleva/accelerated_docking/2023-01-18_second_docking_5zty/out/5zty_1/ZINC20_1_full_42__radius=2__size=2048__binarised.csv
Bins created, shape (9941, 2048) 
Bins created, shape (9951, 2048) 
/storage/margaritakovaleva/accelerated_docking/2023-01-18_second_docking_5zty/out/5zty_1/ZINC20_

KeyError: "['Score'] not in index"

In [62]:
ls /storage/margaritakovaleva/accelerated_docking/2023-01-18_second_docking_5zty/out/5zty_1

'ZINC20_1_full_0__radius=2__size=2048__binarised.csv'
 ZINC20_1_full_0.sdf
'ZINC20_1_full_10__radius=2__size=2048__binarised.csv'
 ZINC20_1_full_10.sdf
'ZINC20_1_full_11__radius=2__size=2048__binarised.csv'
 ZINC20_1_full_11.sdf
'ZINC20_1_full_12__radius=2__size=2048__binarised.csv'
 ZINC20_1_full_12.sdf
'ZINC20_1_full_13__radius=2__size=2048__binarised.csv'
 ZINC20_1_full_13.sdf
'ZINC20_1_full_14__radius=2__size=2048__binarised.csv'
 ZINC20_1_full_14.sdf
'ZINC20_1_full_15__radius=2__size=2048__binarised.csv'
 ZINC20_1_full_15.sdf
'ZINC20_1_full_16__radius=2__size=2048__binarised.csv'
 ZINC20_1_full_16.sdf
'ZINC20_1_full_17__radius=2__size=2048__binarised.csv'
 ZINC20_1_full_17.sdf
'ZINC20_1_full_18__radius=2__size=2048__binarised.csv'
 ZINC20_1_full_18.sdf
'ZINC20_1_full_19__radius=2__size=2048__binarised.csv'
 ZINC20_1_full_19.sdf
'ZINC20_1_full_1__radius=2__size=2048__binarised.csv'
 ZINC20_1_full_1.sdf
'ZINC20_1_full_20__radius=2__size=2048__binarised.csv'


In [68]:
cat /storage/margaritakovaleva/accelerated_docking/2023-01-18_second_docking_5zty/out/5zty_1/ZINC20_1_full_99__radius=2__size=2048__binarised.csv

,ZincID,Score,Smiles,fps_0,fps_1,fps_2,fps_3,fps_4,fps_5,fps_6,fps_7,fps_8,fps_9,fps_10,fps_11,fps_12,fps_13,fps_14,fps_15,fps_16,fps_17,fps_18,fps_19,fps_20,fps_21,fps_22,fps_23,fps_24,fps_25,fps_26,fps_27,fps_28,fps_29,fps_30,fps_31,fps_32,fps_33,fps_34,fps_35,fps_36,fps_37,fps_38,fps_39,fps_40,fps_41,fps_42,fps_43,fps_44,fps_45,fps_46,fps_47,fps_48,fps_49,fps_50,fps_51,fps_52,fps_53,fps_54,fps_55,fps_56,fps_57,fps_58,fps_59,fps_60,fps_61,fps_62,fps_63,fps_64,fps_65,fps_66,fps_67,fps_68,fps_69,fps_70,fps_71,fps_72,fps_73,fps_74,fps_75,fps_76,fps_77,fps_78,fps_79,fps_80,fps_81,fps_82,fps_83,fps_84,fps_85,fps_86,fps_87,fps_88,fps_89,fps_90,fps_91,fps_92,fps_93,fps_94,fps_95,fps_96,fps_97,fps_98,fps_99,fps_100,fps_101,fps_102,fps_103,fps_104,fps_105,fps_106,fps_107,fps_108,fps_109,fps_110,fps_111,fps_112,fps_113,fps_114,fps_115,fps_116,fps_117,fps_118,fps_119,fps_120,fps_121,fps_122,fps_123,fps_124,fps_125,fps_126,fps_127,fps_128,fps_129,fps_130,fps_131,fps_132,fps_133,fps_134,fps_135,f

114,ZINC000583946478,-20.179214,Cc1cc([N+](=O)[O-])ccc1C(=O)N1CC(CN2CC(C)(C)c3ccc(-c4cnn(C)c4)cc32)C1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,

233,ZINC000728977947,-18.52652,COc1ccc(O)c([C@@H](C)NC(=O)c2ccc(C(=O)N[C@H](C)c3cc(OC)ccc3O)s2)c1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,

352,ZINC001844701629,-12.601231,CC[C@@H](C)[C@H](C(=O)N1CC2(C1)CCCN2C(=O)c1cc(N(C)C)ccc1[N+](=O)[O-])c1ccccc1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,

472,ZINC001781755117,-12.379869,Cc1ccc(C)n1-c1ccc(C(=O)N2CC[C@]3(C2)CN(C(=O)[C@]2(C)CC=CCC2)CCO3)cc1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0

590,ZINC000069932626,-28.211091,O=C(CNC(=O)c1sc2ccccc2c1Cl)NC1CCN(Cc2ccc(Cl)cc2)CC1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
6680,ZINC001767331544,-14.83846,CCOC(C)(C)Cn1c(-c2ccc(Cl)cc2OC)nnc1N1CC[C@@H](C2CCOCC2)C1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0

6796,ZINC001764059748,3.311049,CCC(CC)[C@H](Cn1c(C2CCCC2)nnc1N1CCC[C@H](NC(=O)OC(C)(C)C)C1)N(C)C,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0

6912,ZINC001786035827,-22.371273,O=C(CCc1cnc2ccccc2c1)Nc1ccc(C2(NC(=O)c3cnn4ccncc34)CCC2)cc1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0

7027,ZINC001788123324,-10.378253,CC(C)N[C@H](C(=O)Nc1ccc(C2(NC(=O)c3cc(-c4ccco4)no3)CCC2)cc1)C(C)C,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0

7138,ZINC001786652531,-29.981514,C[C@@H](C(=O)NC1(c2ccc(NC(=O)[C@@H]3CC(=O)N(C4CCCC4)C3)cc2)CCC1)c1cccs1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0

7253,ZINC001791461016,-15.991978,O=C(CCc1ccc(F)cc1)Nc1ccc(C2(NC(=O)[C@H]3Cc4ccccc4CN3)CCC2)cc1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [56]:
%%time

process_single_sdf_file(f'{docking_results_folder}/out/5zty_1/ZINC20_1_full_59.sdf'),


Bins created, shape (9927, 2048) 
/storage/margaritakovaleva/accelerated_docking/2023-01-18_second_docking_5zty/out/5zty_1/ZINC20_1_full_59.sdf --> /storage/margaritakovaleva/accelerated_docking/2023-01-18_second_docking_5zty/out/5zty_1/ZINC20_1_full_59__radius=2__size=2048__binarised.csv
CPU times: user 17.8 s, sys: 1.03 s, total: 18.9 s
Wall time: 18.3 s


(0,)

In [None]:
docking_res = docking_res.rename(
    {
        "A": "Smiles",
        "B": "ZincID",
    },
    axis=1,
)

good_columns = ["ZincID", "Score", "Smiles"]
docking_res = docking_res[good_columns]
rv = pd.merge(docking_res, fps_frame, on="ZincID")

In [None]:
name = fps_name
size = fps_size
fps_frame = fps
ftype = fps_type
hexed = False
binarised = True
radius = int(name.split("__")[3].split("=")[1])

In [None]:
if hexed == True:
    out_filename = sdf_filename.replace(".sdf", f"__radius={radius}__size={size}.csv")
    rv.to_csv(out_filename)
    hexed_length = rv.shape[0]
    print(sdf_filename, "-->", out_filename)

if binarised == True:
    hexes = rv[name]
    bins = np.array([hex2bin(s, fps_size) for s in hexes])
    # columns_names
    columns = []
    new_name = name.replace("fps_hex", "fps_bin")
    for i in range(size):
        columns.append(new_name + "_" + str(i))

    bins = pd.DataFrame(data=bins, columns=columns)
    rv = rv.drop([name], axis=1)
    rv = pd.concat([rv, bins], axis=1)

    out_filename = sdf_filename.replace(
        ".sdf", f"__radius={radius}__size={size}__binarised.csv"
    )
    rv.to_csv(out_filename)
    binarised_length = rv.shape[0]
    print(sdf_filename, "-->", out_filename)