In [1]:
import numpy as np
import pandas as pd
import fastai
from tqdm import tqdm_notebook as tqdm
from fastai.tabular import *
import pickle

from multiprocessing import Pool
from sklearn.preprocessing import LabelEncoder, LabelBinarizer, StandardScaler
# Convenience helper attached to the numpy module itself: returns the (min, max) pair of any
# object that exposes .min() and .max(). NOTE(review): this is a monkey-patch, so `np.range`
# only exists after this cell has been executed.
np.range = (lambda x:(x.min(), x.max()))

In [2]:
# Only the per-atom structures table is loaded in this cell; the train/test reads are left
# commented out. The preview further down shows the loaded columns: molecule_name,
# atom_index, atom, x, y, z.
# train = pd.read_csv("train.csv")
# test = pd.read_csv('test.csv')
structures = pd.read_csv('../input/structures.csv')

In [3]:
def add_structure_features(df):
    """Add geometric features derived from the x/y/z coordinates, in place.

    The following columns are appended to ``df`` in this order:

    * ``dist``              - Euclidean norm of ``(x, y, z)``
    * ``v_x, v_y, v_z``     - coordinates divided by ``dist`` (unit direction)
    * ``a_x, a_y, a_z``     - absolute value of each coordinate
    * ``s_x, s_y, s_z``     - sign of each coordinate (-1, 0 or 1)
    * ``min``, ``max``      - smallest / largest absolute coordinate of the row

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with numeric ``x``, ``y`` and ``z`` columns. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the feature columns added.
    """
    axes = ('x', 'y', 'z')

    dist = np.sqrt(df['x'] ** 2 + df['y'] ** 2 + df['z'] ** 2)
    df['dist'] = dist

    # A point lying exactly on the origin has dist == 0, and 0/0 would put NaN into the
    # direction columns. Dividing by 1 there yields a (0, 0, 0) direction instead. Rows whose
    # dist is already NaN are left untouched (NaN == 0 is False).
    safe_dist = dist.mask(dist == 0, 1.0)
    for axis in axes:
        df['v_' + axis] = df[axis] / safe_dist

    for axis in axes:
        df['a_' + axis] = df[axis].abs()
    for axis in axes:
        df['s_' + axis] = np.sign(df[axis])

    abs_coords = df[['x', 'y', 'z']].abs()
    df['min'] = abs_coords.min(axis=1)
    df['max'] = abs_coords.max(axis=1)

    return df

In [4]:
# Append the derived geometric columns. add_structure_features writes the new columns into the
# frame it receives and returns that same frame, so the reassignment keeps the name bound to it.
structures = add_structure_features(structures)

In [5]:
# Preview the first rows, including the newly added feature columns.
structures.head()

Unnamed: 0,molecule_name,atom_index,atom,x,y,z,dist,v_x,v_y,v_z,a_x,a_y,a_z,s_x,s_y,s_z,min,max
0,dsgdb9nsd_000001,0,C,-0.012698,1.085804,0.008001,1.085908,-0.011694,0.999904,0.007368,0.012698,1.085804,0.008001,-1.0,1.0,1.0,0.008001,1.085804
1,dsgdb9nsd_000001,1,H,0.00215,-0.006031,0.001976,0.006701,0.3209,-0.900035,0.29489,0.00215,0.006031,0.001976,1.0,-1.0,1.0,0.001976,0.006031
2,dsgdb9nsd_000001,2,H,1.011731,1.463751,0.000277,1.779373,0.568589,0.822622,0.000155,1.011731,1.463751,0.000277,1.0,1.0,1.0,0.000277,1.463751
3,dsgdb9nsd_000001,3,H,-0.540815,1.447527,-0.876644,1.776603,-0.30441,0.814772,-0.493438,0.540815,1.447527,0.876644,-1.0,1.0,-1.0,0.540815,1.447527
4,dsgdb9nsd_000001,4,H,-0.523814,1.437933,0.906397,1.778648,-0.294501,0.808442,0.509599,0.523814,1.437933,0.906397,-1.0,1.0,1.0,0.523814,1.437933


In [6]:
# Z-score every continuous feature (subtract the column mean, divide by the column standard
# deviation) and store the result as float32.
for col in ('x', 'y', 'z', 'dist', 'v_x', 'v_y', 'v_z', 'a_x', 'a_y', 'a_z', 'min', 'max'):
    structures[col] = (
        structures[col]
        .sub(structures[col].mean())
        .div(structures[col].std())
        .astype(np.float32)
    )

# The sign columns are not rescaled; only their dtype is narrowed.
for col in ('s_x', 's_y', 's_z'):
    structures[col] = structures[col].astype(np.float32)

structures['atom_index'] = structures['atom_index'].astype(np.int16)
structures.describe()

Unnamed: 0,atom_index,x,y,z,dist,v_x,v_y,v_z,a_x,a_y,a_z,s_x,s_y,s_z,min,max
count,2358657.0,2358657.0,2358657.0,2358657.0,2358657.0,2358657.0,2358657.0,2358657.0,2358657.0,2358657.0,2358657.0,2358657.0,2358657.0,2358657.0,2358657.0,2358657.0
mean,8.757349,-6.803775e-07,-3.788459e-06,-3.663693e-07,-2.452273e-08,1.185681e-06,-1.664546e-05,3.657462e-07,7.860106e-07,-2.692544e-07,1.494633e-06,0.07780232,-0.2077784,0.03618627,-1.908379e-06,6.010725e-07
std,5.592487,0.9991393,0.9982817,0.9991086,0.9991422,0.9992731,0.9991241,0.9991654,0.9991232,0.9992008,0.9992303,0.998116,0.983276,0.9982372,0.9996186,0.9990478
min,0.0,-5.636408,-4.826277,-6.361002,-2.060529,-1.883654,-1.420496,-2.025684,-1.169818,-1.324815,-1.098794,-1.0,-1.0,-1.0,-0.9922283,-2.037827
25%,4.0,-0.5857057,-0.7502785,-0.6258549,-0.5749148,-0.7537246,-0.8606139,-0.7410564,-0.7860946,-0.8430145,-0.858613,-1.0,-1.0,-1.0,-0.8036761,-0.6367755
50%,9.0,-0.02601123,-0.03511799,-0.03560693,-0.177933,0.01564048,-0.2368678,-0.03102189,-0.2377276,-0.0601735,-0.1862439,1.0,-1.0,1.0,-0.2782271,-0.1613324
75%,13.0,0.6169443,0.8583548,0.6065696,0.6214064,0.810725,1.109337,0.7375404,0.5979137,0.4883163,0.4768986,1.0,1.0,1.0,0.4691264,0.558412
max,28.0,5.610773,5.286521,5.417029,5.97148,1.763019,1.597627,1.934751,7.539006,7.054461,8.278973,1.0,1.0,1.0,8.492958,7.072009


In [7]:
# Encode the atom symbols as integers. The encoder's codes are shifted up by one so that the
# value 0 is never produced for a real atom.
# NOTE(review): presumably 0 is kept free for the all-zero rows that parse_mol emits - confirm.
atom_encoder = LabelEncoder().fit(structures['atom'])
structures['atom'] = (atom_encoder.transform(structures['atom']) + 1).astype(np.int64)

In [8]:
def get_mol_df(name, structures=structures):
    """Return the rows of ``structures`` whose ``molecule_name`` equals ``name``.

    The default for ``structures`` is bound when this ``def`` executes, i.e. to whatever
    object the module-level ``structures`` name referred to at that moment.
    """
    belongs_to_molecule = structures['molecule_name'] == name
    return structures[belongs_to_molecule]

def parse_mol(df, n_features=16, max_rows=30):
    """Turn one molecule's atom rows into a fixed-size, zero-padded feature matrix.

    Layout of the returned matrix:

    * row 0 is all zeros,
    * rows ``1 .. n_atoms`` hold the atoms ordered by ``atom_index``,
    * the remaining rows up to ``max_rows`` are zero padding.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows belonging to a single molecule. Must contain an ``atom_index`` column; the
        last ``n_features`` columns other than ``atom_index`` are taken as the features.
        The frame is not modified.
    n_features : int, default 16
        Number of trailing columns to use as features.
    max_rows : int, default 30
        Number of rows in the returned matrix (leading zero row + atoms + padding).

    Returns
    -------
    tuple of (numpy.ndarray, int)
        A float64 array of shape ``(max_rows, n_features)`` and the number of atoms.

    Raises
    ------
    ValueError
        If ``atom_index`` contains duplicates, if fewer than ``n_features`` feature
        columns are available, or if the molecule does not fit into ``max_rows`` rows.
    """
    n_atoms = len(df)

    # The non-inplace set_index/sort_index calls return new frames, so no defensive copy of
    # the caller's frame is needed. verify_integrity=True rejects a repeated atom_index.
    atoms = df.set_index('atom_index', verify_integrity=True).sort_index()

    if atoms.shape[1] < n_features:
        raise ValueError(
            'expected at least %d feature columns besides atom_index, got %d'
            % (n_features, atoms.shape[1])
        )
    # One row is reserved for the leading all-zero row.
    if n_atoms + 1 > max_rows:
        raise ValueError(
            'molecule with %d atoms does not fit into %d rows' % (n_atoms, max_rows)
        )

    ret = np.zeros((max_rows, n_features), dtype=np.float64)
    ret[1:n_atoms + 1] = atoms.iloc[:, -n_features:].to_numpy(dtype=np.float64)
    return ret, n_atoms

def get_mol(x):
    """Look up the rows of molecule ``x`` with get_mol_df and parse them with parse_mol."""
    mol_rows = get_mol_df(x)
    return parse_mol(mol_rows)

In [9]:
# Number of distinct molecule names in the structures table, shown as a one-element shape.
structures['molecule_name'].unique().shape

(130775,)

In [10]:
%%time
molecules = structures.groupby('molecule_name').apply(parse_mol)
molecules = dict(molecules)
print()


CPU times: user 6min 53s, sys: 2.21 s, total: 6min 55s
Wall time: 6min 54s


In [11]:
# Collect the set of all molecule names, then parse one molecule straight from `structures`
# via get_mol so its output can be inspected.
molecule_names = set(structures['molecule_name'])
get_mol('dsgdb9nsd_000001')

(array([[ 0.      ,  0.      ,  0.      ,  0.      , ...,  0.      ,  0.      ,  0.      ,  0.      ],
        [ 1.      , -0.064998,  0.713642, -0.037634, ...,  1.      ,  1.      , -0.97594 , -1.066691],
        [ 3.      , -0.056028,  0.164747, -0.041801, ..., -1.      ,  1.      , -0.988205, -2.032808],
        [ 3.      ,  0.553891,  0.903646, -0.042977, ...,  1.      ,  1.      , -0.991665, -0.728526],
        ...,
        [ 0.      ,  0.      ,  0.      ,  0.      , ...,  0.      ,  0.      ,  0.      ,  0.      ],
        [ 0.      ,  0.      ,  0.      ,  0.      , ...,  0.      ,  0.      ,  0.      ,  0.      ],
        [ 0.      ,  0.      ,  0.      ,  0.      , ...,  0.      ,  0.      ,  0.      ,  0.      ],
        [ 0.      ,  0.      ,  0.      ,  0.      , ...,  0.      ,  0.      ,  0.      ,  0.      ]]),
 5)

In [12]:
# Re-pack every entry: element [1] of the stored tuple moves to the front and element [0] is
# converted to a float32 tensor, i.e. (feature_matrix, n_atoms) -> (n_atoms, tensor).
# NOTE: this rewrites `molecules` in place and is not idempotent - running the cell a second
# time would swap the two tuple elements again.
for key in molecules:
    molecules[key] = (molecules[key][1], tensor(molecules[key][0]).type(torch.float32))

In [13]:
# Persist, as one pickled list: the molecules dict, the names of the last 16 columns of
# `structures`, and the fitted atom encoder.
with open('molecules.pkl', 'wb') as f:
    pickle.dump([molecules, structures.columns[-16:].tolist(), atom_encoder], f)

In [14]:
# Spot-check the entry of one randomly chosen molecule (no seed is set in the visible cells,
# so the pick can differ between runs).
molecules[np.random.choice(list(molecule_names))]

(15, tensor([[ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
           0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
           0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
           0.0000e+00],
         [ 1.0000e+00, -8.2353e-03,  7.9202e-01,  1.7872e-01, -1.0748e+00,
           5.4980e-02,  1.5467e+00,  4.4891e-01, -1.0944e+00, -3.0295e-01,
          -7.6943e-01,  1.0000e+00,  1.0000e+00,  1.0000e+00, -8.2680e-01,
          -9.2720e-01],
         [ 4.0000e+00, -5.5959e-02,  7.9465e-02, -1.6716e-02, -1.9229e+00,
          -3.7359e-02, -1.3858e+00,  3.7576e-01, -1.1677e+00, -1.1802e+00,
          -1.0595e+00,  1.0000e+00, -1.0000e+00,  1.0000e+00, -9.8762e-01,
          -1.8810e+00],
         [ 1.0000e+00, -7.2900e-01, -2.0406e-01, -1.7429e-01, -1.0259e+00,
          -1.5633e+00, -7.3900e-01, -3.2382e-01, -1.3782e-01, -7.1612e-01,
          -9.0416e-01, -1.0000e+00, -1.0000e+00, -1.0000e+00, -6.0627e-01,
          -1.0434e+00],


In [15]:
# Shape of the tensor (element [1] of the entry) for another randomly chosen molecule.
molecules[np.random.choice(list(molecule_names))][1].shape

torch.Size([30, 16])

In [16]:
## sanity check: the parsed dict must contain exactly the molecule names found in `structures`
## (a non-empty intersection alone would also pass if only a single molecule had been parsed)
assert molecule_names == set(molecules.keys()), "parsed molecules do not match the molecule names in structures"