In [1]:
%%html
<style type="text/css">
.CodeMirror pre, .output pre { font-family: Consolas, Monaco, monospace; }
.text_cell_render { font-family: Consolas, Monaco, monospace; }
</style>

In [10]:

import numpy as np
import pandas as pd
import os
from tqdm import tqdm_notebook as tqdm

from rdkit import Chem
from mol2vec.features import mol2alt_sentence
import pybel

from gensim.models import Word2Vec


def xyz_to_smiles(fname: str) -> str:
    """Convert a single XYZ structure file into a SMILES string.

    Only the first molecule that pybel yields for ``fname`` is used.  The
    text pybel writes for the "smi" format is split on whitespace and the
    first token is returned, so anything that follows the SMILES itself in
    that text is discarded.
    """
    first_mol = next(pybel.readfile("xyz", fname))
    tokens = first_mol.write(format="smi").split()
    return tokens[0].strip()

# Directory that is scanned for structure files (they are read as "xyz" below).
file_dir = '../input/structures/'
# Snapshot of the directory contents, in whatever order os.listdir returns them.
mols_files = os.listdir(file_dir)
# File name -> position of that name in `mols_files`.
mols_index = {fname: pos for pos, fname in enumerate(mols_files)}
mol_name = list(mols_index)

In [98]:

import sys
sys.path.append('..')
from lib.line_notif import send_message
from lib.utils import reduce_mem_usage, current_time, unpickle, to_pickle

In [27]:
# Sort the file names so that every later step sees them in one fixed order;
# np.sort also turns the Python list into a NumPy array.
mol_name = np.sort(mol_name)

In [28]:
import multiprocessing as mp

# One chunk of file names per CPU core.
n_split = mp.cpu_count()
# Chunk length, rounded up so that n_split chunks cover every name.
unit = np.ceil(len(mol_name) / n_split).astype(int)
# [start, stop) slice bounds of each chunk; bounds that run past the end of
# `mol_name` simply produce shorter (possibly empty) slices.
indexer = [[unit * i, unit * (i + 1)] for i in range(n_split)]

split_mol_names = [mol_name[start:stop] for start, stop in indexer]

# Each payload is a dict so it can be handed to a worker as a single argument.
mp_data = [{"mol_names": names} for names in split_mol_names]

In [31]:

def xyz_to_smiles(data: dict) -> list:
    """Convert a chunk of XYZ structure files into SMILES strings.

    This is the worker function handed to ``Pool.map``: each call receives
    one ``data`` payload and converts every file named in it.

    Args:
        data: Mapping whose ``"mol_names"`` entry is an iterable of file
            names located inside the module-level ``file_dir`` directory.

    Returns:
        A list with one SMILES string per file name, in input order.  An
        empty ``"mol_names"`` entry yields an empty list.
    """
    smiles_list = []
    for fname in data["mol_names"]:
        # os.path.join copes with `file_dir` given with or without a
        # trailing separator.
        mol = next(pybel.readfile("xyz", os.path.join(file_dir, fname)))
        # The "smi" text is split on whitespace and only its first token
        # (the SMILES itself) is kept.
        smiles_list.append(mol.write(format="smi").split()[0].strip())
    return smiles_list
                                        
# Start one worker process per core.  Pool.map hands each entry of `mp_data`
# to xyz_to_smiles and returns the per-chunk results in the same order.
num_workers = mp.cpu_count()
with mp.Pool(processes=num_workers) as executor:
    features_chunk = executor.map(xyz_to_smiles, mp_data)

In [34]:
features_chunk[0]

['C',
 'N',
 'O',
 'C#C',
 'C#N',
 'CC',
 'CO',
 'CC#C',
 'CC#N',
 'CC=O',
 'NC=O',
 'CCC',
 'CCO',
 'COC',
 'C1CC1',
 'C1CO1',
 'CC(=O)C',
 'CC(=O)N',
 'NC(=O)N',
 'CC(C)C',
 'CC(C)O',
 'C(#C)C#C',
 'C(#C)C#N',
 'O=CC#C',
 'O=CC#N',
 'O=CC=O',
 'CC#CC',
 'CCC#C',
 'CCC#N',
 'NCC#N',
 'OCC#C',
 'OCC#N',
 'CCC=O',
 'CNC=O',
 'COC=O',
 'OCC=O',
 'CCCC',
 'CCCO',
 'CCOC',
 'OCCO',
 'CC1CC1',
 'C[C@H]1CO1',
 'CN1CC1',
 'OC1CC1',
 'C1CCC1',
 'C1COC1',
 'CC(=NO)C',
 '[nH]1cccc1',
 '[nH]1ccnc1',
 'o1cccc1',
 'o1ccnc1',
 'CC(C)(C)C',
 'CC(C)(C)O',
 'CC(=O)C#C',
 'CC(=O)C#N',
 'NC(=O)C#C',
 'CC(=O)C=O',
 'NC(=O)C=O',
 'CC(C)C#C',
 'CC(C)C#N',
 'C[C@@H](N)C#N',
 'C[C@@H](O)C#C',
 'C[C@@H](O)C#N',
 'CC(C)C=O',
 'C[C@@H](O)C=O',
 'CN(C)C=O',
 'CC(=O)CO',
 'CCC(=O)C',
 'CCC(=O)N',
 'CNC(=O)C',
 'CNC(=O)N',
 'COC(=N)C',
 'COC(=O)C',
 'COC(=O)N',
 'NC(=O)CO',
 'CC(C)CO',
 'C[C@@H](O)CO',
 'CCC(C)C',
 'CC[C@@H](C)O',
 'COC(C)C',
 'CC1(C)CC1',
 'CC1(C)CO1',
 'CC1(O)CC1',
 'N=C1CCO1',
 'O=C1CCC1',
 'O=C

In [35]:
# Flatten the per-chunk result lists into one array of SMILES strings.
smiles = np.concatenate(features_chunk)

In [38]:
smiles[110]

'CC#CC#N'

In [39]:
# Pair every structure file name with its SMILES string.  This relies on
# `smiles` being in the same order as `mol_name`: the chunks were sliced from
# `mol_name` in order and their results were concatenated back in order.
df_smiles = pd.DataFrame({'molecule_name': mol_name, 'smiles': smiles})

Unnamed: 0,molecule_name,smiles
0,dsgdb9nsd_000001.xyz,C
1,dsgdb9nsd_000002.xyz,N
2,dsgdb9nsd_000003.xyz,O
3,dsgdb9nsd_000004.xyz,C#C
4,dsgdb9nsd_000005.xyz,C#N


In [40]:
# Order the rows by file name.  The sorted frame is re-bound to the same name
# rather than mutated in place, so the cell reads as a plain assignment and
# can be chained with further DataFrame methods.
df_smiles = df_smiles.sort_values("molecule_name")

In [41]:
df_smiles.head()

Unnamed: 0,molecule_name,smiles
0,dsgdb9nsd_000001.xyz,C
1,dsgdb9nsd_000002.xyz,N
2,dsgdb9nsd_000003.xyz,O
3,dsgdb9nsd_000004.xyz,C#C
4,dsgdb9nsd_000005.xyz,C#N


In [42]:
# Pull the SMILES column back out as a NumPy array, in the row order of the
# sorted frame.
smiles = df_smiles["smiles"].values

In [None]:
# Load the pre-trained word2vec model whose `wv` vectors `func` looks up.
# NOTE(review): gensim's load() unpickles the file, so only load model files
# that come from a trusted source.
# NOTE(review): the file name suggests 300-dimensional vectors, matching the
# hard-coded 300 used further down — confirm.
model = Word2Vec.load('../input/model_300dim.pkl')

In [77]:
def func(smiles, vector_size=300):
    """Look up one word2vec vector per SMILES string.

    Each SMILES is parsed with RDKit and turned into a mol2vec "sentence"
    (``mol2alt_sentence`` is called with 1 as its second argument).  The row
    for that molecule is the vector of the first sentence token known to the
    module-level ``model``.  A row of NaNs is emitted instead when the SMILES
    cannot be converted into a sentence, when the sentence is empty, or when
    none of its tokens is in the model's vocabulary.

    Exactly one row is produced per input, so the result stays aligned with
    ``smiles``.

    Args:
        smiles: Iterable of SMILES strings.
        vector_size: Length of the NaN placeholder rows; it should equal the
            dimensionality of the vectors stored in ``model``.

    Returns:
        list: One row per input.  Each row is either the object returned by
        ``model.wv[token]`` or a list of ``vector_size`` NaNs.
    """
    rows = []
    for s in tqdm(smiles):
        try:
            sentence = mol2alt_sentence(Chem.MolFromSmiles(s), 1)
        except Exception:
            # Deliberate best effort: any parsing/fingerprinting failure is
            # recorded as a NaN row instead of aborting the whole run.
            rows.append([np.nan] * vector_size)
            continue

        for token in sentence:
            try:
                vector = model.wv[token]
            except KeyError:
                # Token is not in the vocabulary; try the next one.
                continue
            rows.append(vector)
            break
        else:
            # Loop finished without a hit (or the sentence was empty):
            # emit the placeholder exactly once.
            rows.append([np.nan] * vector_size)
    return rows

In [78]:
# Run the vector lookup over all SMILES (func shows a tqdm progress bar).
mol_vec = func(smiles)

HBox(children=(IntProgress(value=0, max=130775), HTML(value='')))

In [80]:
# Convert the list of rows into a NumPy array (its shape is inspected next).
mol_vec = np.array(mol_vec)

In [81]:
mol_vec.shape

(130775, 300)

In [82]:
# Wrap the array in a DataFrame; the columns get default integer labels.
mol_vec_df = pd.DataFrame(mol_vec)

In [83]:
mol_vec_df.isna().sum()

0      2054
1      2054
2      2054
3      2054
4      2054
       ... 
295    2054
296    2054
297    2054
298    2054
299    2054
Length: 300, dtype: int64

In [91]:
# Load the structures and train tables.
structures = pd.read_csv("../input/structures.csv")
train = pd.read_csv("../input/train.csv")
# Label each embedding row with a molecule name.
# NOTE(review): the names are attached purely by position. This assumes the
# sorted unique names in structures.csv line up one-to-one with the sorted
# .xyz file names the rows were built from. df_smiles["molecule_name"] holds
# those file names if an explicit cross-check is wanted.
mol_vec_df["molecule_name"] = np.sort(structures.molecule_name.unique())

In [93]:
mol_vec_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,291,292,293,294,295,296,297,298,299,molecule_name
0,,,,,,,,,,,...,,,,,,,,,,dsgdb9nsd_000001
1,-0.306596,0.22323,-0.165032,0.340601,0.122465,0.127082,-0.055324,-0.256667,0.035283,0.287777,...,0.39287,0.049481,0.146393,0.178297,-0.338194,0.009021,0.305351,-0.513758,0.322367,dsgdb9nsd_000002
2,-0.181573,-0.156637,-0.001124,-0.450791,0.018562,0.175651,-0.293198,-0.016114,0.120915,0.38491,...,0.369871,0.038316,0.128176,0.131276,0.024598,-0.127479,-0.072179,-0.445017,0.018748,dsgdb9nsd_000003
3,0.211415,0.029348,0.35128,0.371282,0.780693,-0.09997,-0.497356,0.217432,0.49835,-0.008529,...,0.021449,0.37044,0.156537,-0.936061,0.257772,-0.111415,-0.470363,-0.260219,0.209732,dsgdb9nsd_000004
4,0.211415,0.029348,0.35128,0.371282,0.780693,-0.09997,-0.497356,0.217432,0.49835,-0.008529,...,0.021449,0.37044,0.156537,-0.936061,0.257772,-0.111415,-0.470363,-0.260219,0.209732,dsgdb9nsd_000005


In [94]:
train.head()

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant
0,0,dsgdb9nsd_000001,1,0,1JHC,84.8076
1,1,dsgdb9nsd_000001,1,2,2JHH,-11.257
2,2,dsgdb9nsd_000001,1,3,2JHH,-11.2548
3,3,dsgdb9nsd_000001,1,4,2JHH,-11.2543
4,4,dsgdb9nsd_000001,2,0,1JHC,84.8074


In [95]:
# Attach the molecule embedding to every train row; the left join keeps all
# rows of `train`.
# NOTE(review): in top-to-bottom order this runs before the cell that renames
# the embedding columns to vec_*, so `train` receives them under their integer
# labels — confirm that is intended. Re-running this cell merges the columns
# in a second time because `train` is overwritten with its own merge result.
train = train.merge(mol_vec_df, on="molecule_name", how="left")

In [101]:
# Label the 300 embedding columns vec_0 .. vec_299 and keep "molecule_name"
# as the last column label.
mol_vec_df.columns = [
    *(f"vec_{c}" for c in range(300)),
    "molecule_name",
]

In [102]:
mol_vec_df.head()

Unnamed: 0,vec_0,vec_1,vec_2,vec_3,vec_4,vec_5,vec_6,vec_7,vec_8,vec_9,...,vec_291,vec_292,vec_293,vec_294,vec_295,vec_296,vec_297,vec_298,vec_299,molecule_name
0,,,,,,,,,,,...,,,,,,,,,,dsgdb9nsd_000001
1,-0.306596,0.22323,-0.165032,0.340601,0.122465,0.127082,-0.055324,-0.256667,0.035283,0.287777,...,0.39287,0.049481,0.146393,0.178297,-0.338194,0.009021,0.305351,-0.513758,0.322367,dsgdb9nsd_000002
2,-0.181573,-0.156637,-0.001124,-0.450791,0.018562,0.175651,-0.293198,-0.016114,0.120915,0.38491,...,0.369871,0.038316,0.128176,0.131276,0.024598,-0.127479,-0.072179,-0.445017,0.018748,dsgdb9nsd_000003
3,0.211415,0.029348,0.35128,0.371282,0.780693,-0.09997,-0.497356,0.217432,0.49835,-0.008529,...,0.021449,0.37044,0.156537,-0.936061,0.257772,-0.111415,-0.470363,-0.260219,0.209732,dsgdb9nsd_000004
4,0.211415,0.029348,0.35128,0.371282,0.780693,-0.09997,-0.497356,0.217432,0.49835,-0.008529,...,0.021449,0.37044,0.156537,-0.936061,0.257772,-0.111415,-0.470363,-0.260219,0.209732,dsgdb9nsd_000005


In [None]:
# Save the embedding table with the project's to_pickle helper (imported from
# lib.utils and called here as (path, object)).
# NOTE(review): presumably the ../processed/v003/ directory has to exist
# already — confirm against lib.utils.
to_pickle("../processed/v003/mol_vec_df.pkl", mol_vec_df)