In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

import os
import sys
from pathlib import Path
from time import time
import numpy as np
import pandas as pd
from glob import glob
import matplotlib
import matplotlib.pyplot as plt
print(Path.cwd())

In [16]:
def sizeof(data, verbose=True):
    sz = sys.getsizeof(data)/1e9
    if verbose: print(f'Size in GB: {sz}')
    return sz

In [3]:
fea_main_dir = Path('/vol/ml/apartin/projects/covid-19/mol-features/data/raw/OZD-Kyle/')
fea_type = 'descriptors'
fea_dir = fea_main_dir/fea_type
fea_files = sorted( fea_dir.glob('OZD-*.csv') )
print(len(fea_files))

668


# Load descriptors

In [4]:
dd_prfx = 'dd'
dd_sep = '_'

In [5]:
fea_col_names = pd.read_csv(fea_main_dir/'headers.csv').columns.tolist()
fea_col_names = [c.strip() for c in fea_col_names] # clean col names
print(fea_col_names[:3])
print(len(fea_col_names))

['ABC', 'ABCGG', 'nAcid']
1826


In [6]:
fea_col_names = [dd_prfx+dd_sep+str(c) for c in fea_col_names] # prefix fea cols
cols = ['CAT', 'TITLE', 'SMILES'] + fea_col_names

# Load single fea df

In [36]:
dd0 = pd.read_csv( Path(fea_files[0]), names=cols )
print(dd0.shape)
print('Expected size when including drugs', sizeof(dd0, verbose=False) * len(fea_files))
display(dd0[:2])

(10000, 1829)
Size in GB: 0.148535416


Unnamed: 0,CAT,TITLE,SMILES,dd_ABC,dd_ABCGG,dd_nAcid,dd_nBase,dd_SpAbs_A,dd_SpMax_A,dd_SpDiam_A,...,dd_SRW10,dd_TSRW10,dd_MW,dd_AMW,dd_WPath,dd_WPol,dd_Zagreb1,dd_Zagreb2,dd_mZagreb1,dd_mZagreb2
0,OZD,ZINC000095370606,Cc1ccc(cc1)[C@@H]2C[N@@H+]([C@@H]3[C@H]2N4CCC3...,22.494165,16.342575,0.0,3.0,37.88776,2.578878,5.044375,...,10.458378,78.30884,377.282,5.988603,2056.0,46.0,156.0,189.0,7.111111,6.083334
1,OZD,ZINC000040149497,c1ccc(cc1)N2CC[NH+](CC2)C/C(=[NH+]/OCC(=O)NCc3...,22.20804,15.742774,0.0,3.0,37.409843,2.317886,4.635773,...,9.928814,63.701084,417.19205,7.319159,3030.0,38.0,142.0,158.0,8.527778,6.527778


# Agg all fea files to single file

In [23]:
# dfs = []
# dd0 = pd.read_csv( Path(fea_files[0]), names=cols )
# dd1 = pd.read_csv( Path(fea_files[1]), names=cols )
# dfs.append(dd0)
# dfs.append(dd1)
# df = pd.concat(dfs, axis=0).shape
# sizeof(dfs[0])*600

In [31]:
df = None
t0 = time()
dfs = []

for i, f in enumerate(fea_files[:100]):
    print(f'Loading {i+1} ... {f}')
    dd = pd.read_csv( Path(fea_files[0]), names=cols )
    # print(sizeof(dd));
    dfs.append(dd)
    
runtime = time() - t0
print('\nRuntime: {:.2f} mins'.format( runtime/60 ))    

Loading 1 ... /vol/ml/apartin/projects/covid-19/mol-features/data/raw/OZD-Kyle/descriptors/OZD-0-10000.csv
Loading 2 ... /vol/ml/apartin/projects/covid-19/mol-features/data/raw/OZD-Kyle/descriptors/OZD-10000-20000.csv
Loading 3 ... /vol/ml/apartin/projects/covid-19/mol-features/data/raw/OZD-Kyle/descriptors/OZD-100000-110000.csv
Loading 4 ... /vol/ml/apartin/projects/covid-19/mol-features/data/raw/OZD-Kyle/descriptors/OZD-1000000-1010000.csv
Loading 5 ... /vol/ml/apartin/projects/covid-19/mol-features/data/raw/OZD-Kyle/descriptors/OZD-1010000-1020000.csv
Loading 6 ... /vol/ml/apartin/projects/covid-19/mol-features/data/raw/OZD-Kyle/descriptors/OZD-1020000-1030000.csv
Loading 7 ... /vol/ml/apartin/projects/covid-19/mol-features/data/raw/OZD-Kyle/descriptors/OZD-1030000-1040000.csv
Loading 8 ... /vol/ml/apartin/projects/covid-19/mol-features/data/raw/OZD-Kyle/descriptors/OZD-1040000-1050000.csv
Loading 9 ... /vol/ml/apartin/projects/covid-19/mol-features/data/raw/OZD-Kyle/descriptors/OZD

In [42]:
fea_df = pd.concat(dfs, axis=0)
print(fea_df.shape)
print(sizeof(fea_df));

(1000000, 1829)
Size in GB: 14.861525632
14.861525632


# Load docking scores

In [43]:
# meta_path = Path('/vol/ml/apartin/projects/covid-19/mol-features/nbs/OZD.May29.all.csv')
dock_main_dir = Path('/vol/ml/apartin/projects/covid-19/mol-features/data/raw/docking-2020-06-01/all/OZD')
dock = pd.read_csv(dock_main_dir/'3CLPro_7BQY_A_1_F.Orderable_zinc_db_enaHLL.sorted.4col.csv')
print(dock.shape)
display(dock[:2])

(6109329, 4)


Unnamed: 0,Inchi-key,SMILES,TITLE,Chemgauss4
0,MQQAVNHQRUNOBV-SDHOMARFSA-P,Cc1ccc(cc1)[C@@H]2C[N@@H+]([C@@H]3[C@H]2N4CCC3...,ZINC000095370606,-15.27285
1,DGLCKPUNCXSIDP-UHFFFAOYSA-P,c1ccc(cc1)N2CC[NH+](CC2)C/C(=[NH+]/OCC(=O)NCc3...,ZINC000040149497,-14.439569


In [44]:
dd_trg = pd.merge(dock, fea_df, how='inner', on=['TITLE', 'SMILES'])
print(dd_trg.shape)
print(sizeof(dd_trg));
display(dd_trg[:2])

(1000000, 1831)
Size in GB: 14.953525632
14.953525632


Unnamed: 0,Inchi-key,SMILES,TITLE,Chemgauss4,CAT,dd_ABC,dd_ABCGG,dd_nAcid,dd_nBase,dd_SpAbs_A,...,dd_SRW10,dd_TSRW10,dd_MW,dd_AMW,dd_WPath,dd_WPol,dd_Zagreb1,dd_Zagreb2,dd_mZagreb1,dd_mZagreb2
0,MQQAVNHQRUNOBV-SDHOMARFSA-P,Cc1ccc(cc1)[C@@H]2C[N@@H+]([C@@H]3[C@H]2N4CCC3...,ZINC000095370606,-15.27285,OZD,22.494165,16.342575,0.0,3.0,37.88776,...,10.458378,78.30884,377.282,5.988603,2056.0,46.0,156.0,189.0,7.111111,6.083334
1,MQQAVNHQRUNOBV-SDHOMARFSA-P,Cc1ccc(cc1)[C@@H]2C[N@@H+]([C@@H]3[C@H]2N4CCC3...,ZINC000095370606,-15.27285,OZD,22.494165,16.342575,0.0,3.0,37.88776,...,10.458378,78.30884,377.282,5.988603,2056.0,46.0,156.0,189.0,7.111111,6.083334


In [45]:
meta_cols = ['CAT', 'Inchi-key', 'SMILES', 'TITLE', 'Chemgauss4']
dd_trg = dd_trg[ meta_cols + fea_col_names ]
print(dd_trg.shape)
print(sizeof(dd_trg));
display(dd_trg[:2])

(1000000, 1831)
Size in GB: 14.953525632
14.953525632


Unnamed: 0,CAT,Inchi-key,SMILES,TITLE,Chemgauss4,dd_ABC,dd_ABCGG,dd_nAcid,dd_nBase,dd_SpAbs_A,...,dd_SRW10,dd_TSRW10,dd_MW,dd_AMW,dd_WPath,dd_WPol,dd_Zagreb1,dd_Zagreb2,dd_mZagreb1,dd_mZagreb2
0,OZD,MQQAVNHQRUNOBV-SDHOMARFSA-P,Cc1ccc(cc1)[C@@H]2C[N@@H+]([C@@H]3[C@H]2N4CCC3...,ZINC000095370606,-15.27285,22.494165,16.342575,0.0,3.0,37.88776,...,10.458378,78.30884,377.282,5.988603,2056.0,46.0,156.0,189.0,7.111111,6.083334
1,OZD,MQQAVNHQRUNOBV-SDHOMARFSA-P,Cc1ccc(cc1)[C@@H]2C[N@@H+]([C@@H]3[C@H]2N4CCC3...,ZINC000095370606,-15.27285,22.494165,16.342575,0.0,3.0,37.88776,...,10.458378,78.30884,377.282,5.988603,2056.0,46.0,156.0,189.0,7.111111,6.083334


In [47]:
dd_trg.shape

(1000000, 1831)

In [46]:
dd_trg

Unnamed: 0,CAT,Inchi-key,SMILES,TITLE,Chemgauss4,dd_ABC,dd_ABCGG,dd_nAcid,dd_nBase,dd_SpAbs_A,...,dd_SRW10,dd_TSRW10,dd_MW,dd_AMW,dd_WPath,dd_WPol,dd_Zagreb1,dd_Zagreb2,dd_mZagreb1,dd_mZagreb2
0,OZD,MQQAVNHQRUNOBV-SDHOMARFSA-P,Cc1ccc(cc1)[C@@H]2C[N@@H+]([C@@H]3[C@H]2N4CCC3...,ZINC000095370606,-15.272850,22.494165,16.342575,0.0,3.0,37.887760,...,10.458378,78.30884,377.28200,5.988603,2056.0,46.0,156.0,189.0,7.111111,6.083334
1,OZD,MQQAVNHQRUNOBV-SDHOMARFSA-P,Cc1ccc(cc1)[C@@H]2C[N@@H+]([C@@H]3[C@H]2N4CCC3...,ZINC000095370606,-15.272850,22.494165,16.342575,0.0,3.0,37.887760,...,10.458378,78.30884,377.28200,5.988603,2056.0,46.0,156.0,189.0,7.111111,6.083334
2,OZD,MQQAVNHQRUNOBV-SDHOMARFSA-P,Cc1ccc(cc1)[C@@H]2C[N@@H+]([C@@H]3[C@H]2N4CCC3...,ZINC000095370606,-15.272850,22.494165,16.342575,0.0,3.0,37.887760,...,10.458378,78.30884,377.28200,5.988603,2056.0,46.0,156.0,189.0,7.111111,6.083334
3,OZD,MQQAVNHQRUNOBV-SDHOMARFSA-P,Cc1ccc(cc1)[C@@H]2C[N@@H+]([C@@H]3[C@H]2N4CCC3...,ZINC000095370606,-15.272850,22.494165,16.342575,0.0,3.0,37.887760,...,10.458378,78.30884,377.28200,5.988603,2056.0,46.0,156.0,189.0,7.111111,6.083334
4,OZD,MQQAVNHQRUNOBV-SDHOMARFSA-P,Cc1ccc(cc1)[C@@H]2C[N@@H+]([C@@H]3[C@H]2N4CCC3...,ZINC000095370606,-15.272850,22.494165,16.342575,0.0,3.0,37.887760,...,10.458378,78.30884,377.28200,5.988603,2056.0,46.0,156.0,189.0,7.111111,6.083334
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,OZD,SJTOMVPZRFLKOA-NHCUHLMSSA-P,C[NH+](C)C[C@@H]1C[N@@H+](C[C@@H]1CO)Cc2cnn(c2...,ZINC000091656771,-11.771907,21.310663,16.290300,0.0,2.0,35.373165,...,10.148040,77.93240,366.24088,6.425278,2086.0,39.0,144.0,169.0,7.750000,5.916666
999996,OZD,SJTOMVPZRFLKOA-NHCUHLMSSA-P,C[NH+](C)C[C@@H]1C[N@@H+](C[C@@H]1CO)Cc2cnn(c2...,ZINC000091656771,-11.771907,21.310663,16.290300,0.0,2.0,35.373165,...,10.148040,77.93240,366.24088,6.425278,2086.0,39.0,144.0,169.0,7.750000,5.916666
999997,OZD,SJTOMVPZRFLKOA-NHCUHLMSSA-P,C[NH+](C)C[C@@H]1C[N@@H+](C[C@@H]1CO)Cc2cnn(c2...,ZINC000091656771,-11.771907,21.310663,16.290300,0.0,2.0,35.373165,...,10.148040,77.93240,366.24088,6.425278,2086.0,39.0,144.0,169.0,7.750000,5.916666
999998,OZD,SJTOMVPZRFLKOA-NHCUHLMSSA-P,C[NH+](C)C[C@@H]1C[N@@H+](C[C@@H]1CO)Cc2cnn(c2...,ZINC000091656771,-11.771907,21.310663,16.290300,0.0,2.0,35.373165,...,10.148040,77.93240,366.24088,6.425278,2086.0,39.0,144.0,169.0,7.750000,5.916666
