In [1]:
# Enable IPython's autoreload extension in mode 2, so edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np

from preprocess import (load_data, encode_labels, 
                        split_validation, count_atoms)
from predict import eval_metric, predict_hgbr
from features import distance

## Create Simple Model

### Load Training Data

In [3]:
# load_data (from preprocess) returns three objects, unpacked here as the
# training frame, the test frame and the atom coordinates.
train, test, coords = load_data()

  mask |= (ar1 == a)


### Create Train-Validation Split

In [4]:
# Split `train` into training/validation features and targets.
# NOTE(review): whether the split is seeded is not visible here — confirm,
# since scores from different cells are compared against each other.
x_train, x_val, y_train, y_val = split_validation(train)

### Load GB Inference Model (GBR, XGB, LGB, HGBR)

Starting with HGBR

In [6]:
# Baseline: predict_hgbr receives the training features/targets and the
# validation features and returns predictions for the validation rows.
y_val_hat = predict_hgbr(x_train, y_train, x_val)

# Score the predictions; the coupling `type` column is passed as the third
# argument (presumably for per-type grouping — confirm in predict.eval_metric).
eval_metric(y_val, y_val_hat, x_val.type)

1.1573388617410183

## Feature Creation

### Distance

In [9]:
# Add the atom-pair distance as a feature column `d`. In the rows displayed
# later in this notebook, `d` equals the Euclidean distance between
# (x_0, y_0, z_0) and (x_1, y_1, z_1).
d = distance(train)
train_d = train.assign(d=d)

# Re-split the augmented frame and refit the HGBR helper on it.
# Note: this rebinds y_train / y_val from the baseline split.
x_d_train, x_d_val, y_train, y_val = split_validation(train_d)
y_d_val_hat = predict_hgbr(x_d_train, y_train, x_d_val)

# Same metric call as the baseline, now on the distance-augmented features.
eval_metric(y_val, y_d_val_hat, x_d_val.type)

0.7961766900570408

### Center Molecule Coords

- [x] Find center of molecule by averaging atom positions
- [x] Subtract centroid coords from molecule coords

In [12]:
# Reload all three objects with the positional flag set to True; going by the
# section heading this is meant to enable centroid-centring of coordinates.
# This rebinds `train`, `test` and `coords` from the first load.
# NOTE(review): the x/y/z values in the train_af.head() output further down do
# not look centred — for dsgdb9nsd_000001 the carbon sits at roughly
# (-0.013, 1.086, 0.008), which is also the mean of the five atom positions
# shown. Confirm whether load_data(True) centres `train` or only `coords`.
train, test, coords = load_data(True)

  mask |= (ar1 == a)


In [13]:
# Recompute the pair distance from the freshly re-loaded `train` rather than
# reusing `d` from the earlier "Distance" cell. That Series was built from the
# previous `train` object, so reusing it made this cell depend on stale
# cross-cell state and on the two frames sharing an identical index.
train_dc = train.assign(d=distance(train))

# Split, fit the HGBR helper and score, exactly as in the previous experiments.
# Note: this rebinds y_train / y_val from the earlier split.
x_dc_train, x_dc_val, y_train, y_val = split_validation(train_dc)
y_dc_val_hat = predict_hgbr(x_dc_train, y_train, x_dc_val)

eval_metric(y_val, y_dc_val_hat, x_dc_val.type)

0.7936400298618033

In [14]:
# Per-molecule atom counts built from `coords`; in the merged output they show
# up as the columns atom_C, atom_F, atom_H, atom_N and atom_O.
atom_freqs = count_atoms(coords)

# Left-join the counts onto every atom-pair row. validate='many_to_one' makes
# pandas raise if `atom_freqs` holds more than one row per molecule, which
# would otherwise silently duplicate training rows.
# (The former mid-cell `atom_freqs.head()` was removed: only the last
# expression of a cell is displayed, so it had no effect.)
train_af = train_dc.merge(atom_freqs, how='left', on='molecule_name',
                          validate='many_to_one')
train_af.head()

Unnamed: 0,molecule_name,type,scalar_coupling_constant,atom_0,x_0,y_0,z_0,atom_1,x_1,y_1,z_1,d,atom_C,atom_F,atom_H,atom_N,atom_O
0,dsgdb9nsd_000001,1JHC,84.8076,H,0.00215,-0.006031,0.001976,C,-0.012698,1.085804,0.008001,1.091953,1,0,4,0,0
1,dsgdb9nsd_000001,2JHH,-11.257,H,0.00215,-0.006031,0.001976,H,1.011731,1.463751,0.000277,1.78312,1,0,4,0,0
2,dsgdb9nsd_000001,2JHH,-11.2548,H,0.00215,-0.006031,0.001976,H,-0.540815,1.447527,-0.876644,1.783147,1,0,4,0,0
3,dsgdb9nsd_000001,2JHH,-11.2543,H,0.00215,-0.006031,0.001976,H,-0.523814,1.437933,0.906397,1.783157,1,0,4,0,0
4,dsgdb9nsd_000001,1JHC,84.8074,H,1.011731,1.463751,0.000277,C,-0.012698,1.085804,0.008001,1.091952,1,0,4,0,0


In [15]:
# Split the frame that carries distance + atom counts, using stage-suffixed
# names so the earlier y_train / y_val are left untouched.
x_af_train, x_af_val, y_af_train, y_af_val = split_validation(train_af)
y_af_hat = predict_hgbr(x_af_train, y_af_train, x_af_val)

# Same metric call as before, on the atom-frequency feature set.
eval_metric(y_af_val, y_af_hat, x_af_val.type)

0.770931169583137

### Using LightGBM

In [16]:
import lightgbm as lgb

In [17]:
# Inspect the training features before they are label-encoded.
x_af_train.head()

Unnamed: 0,molecule_name,type,atom_0,x_0,y_0,z_0,atom_1,x_1,y_1,z_1,d,atom_C,atom_F,atom_H,atom_N,atom_O
2183914,dsgdb9nsd_067982,1JHC,H,-2.186542,-0.861022,0.297951,C,-1.155135,-0.82379,0.650627,1.090673,7,0,8,0,2
7424,dsgdb9nsd_000397,2JHC,H,0.689308,-0.308381,0.873468,C,-1.314335,-0.609705,0.10843,2.165794,3,0,8,0,3
3532660,dsgdb9nsd_101672,3JHC,H,0.14311,-0.540812,3.551988,C,-0.105774,0.157833,0.122043,3.509212,8,0,18,0,1
1853368,dsgdb9nsd_059465,3JHH,H,0.009353,-0.967569,-0.966649,H,1.93233,1.343685,-0.349033,3.069394,8,0,14,0,1
1751969,dsgdb9nsd_057053,3JHC,H,0.340697,-2.973158,-0.247377,C,0.822307,-0.5581,1.227171,2.870321,6,0,10,2,1


In [18]:
# Encode the string columns for LightGBM. Per the displayed output, `type`,
# `atom_0` and `atom_1` come back as integer codes and `molecule_name` is
# dropped; the remaining columns keep their order.
x_lgb_train = encode_labels(x_af_train)
x_lgb_train.head()

Unnamed: 0,type,atom_0,x_0,y_0,z_0,atom_1,x_1,y_1,z_1,d,atom_C,atom_F,atom_H,atom_N,atom_O
2183914,0,0,-2.186542,-0.861022,0.297951,0,-1.155135,-0.82379,0.650627,1.090673,7,0,8,0,2
7424,2,0,0.689308,-0.308381,0.873468,0,-1.314335,-0.609705,0.10843,2.165794,3,0,8,0,3
3532660,5,0,0.14311,-0.540812,3.551988,0,-0.105774,0.157833,0.122043,3.509212,8,0,18,0,1
1853368,6,0,0.009353,-0.967569,-0.966649,1,1.93233,1.343685,-0.349033,3.069394,8,0,14,0,1
1751969,5,0,0.340697,-2.973158,-0.247377,0,0.822307,-0.5581,1.227171,2.870321,6,0,10,2,1


In [19]:
# Wrap the encoded training features and their targets in a LightGBM Dataset.
train_lgb = lgb.Dataset(x_lgb_train, label=y_af_train)

In [32]:
# LightGBM training parameters: 'mae' objective and num_leaves of 63. Every
# other setting is left at the library default.
param = {'objective': 'mae', 'num_leaves': 63}

In [33]:
# Train a booster on train_lgb with `param`. No number of boosting rounds and
# no validation set are passed, so the library defaults apply.
bst = lgb.train(param, train_lgb)

In [34]:
# Encode the validation features with the same helper used for training, then
# predict with the trained booster.
# NOTE(review): encode_labels is called independently on each frame — confirm
# it uses a fixed mapping so the integer codes agree between train, validation
# and test.
x_lgb_val = encode_labels(x_af_val)
y_lgb_hat = bst.predict(x_lgb_val)

In [35]:
# Score the LightGBM predictions with the same metric call as the HGBR runs;
# the (encoded) `type` column of the validation features is the third argument.
eval_metric(y_af_val, y_lgb_hat, x_lgb_val.type)

0.6482341051190379

In [None]:
# Peek at the test frame before building features on it.
test.head()

In [None]:
# Attach the per-molecule atom counts to the test rows, using the same join
# keys and join type as for the training frame.
test_af = test.merge(atom_freqs, how='left', on='molecule_name')
test_af.head()

In [None]:
# Add the pair-distance column `d` to the test frame.
# NOTE(review): here `d` is appended after the atom_* columns, whereas in the
# training frame `d` comes before them (it was assigned before the merge).
# Make sure the column order matches the training features before predicting.
test_af = test_af.assign(d=distance(test_af))
test_af.head()

In [None]:
# Encode the test features, then select the columns in exactly the order the
# booster was trained on. The test frame was built merge-first, so `d` sits
# after the atom_* columns there, while the training frame has `d` before
# them. Booster.predict uses column position, so the frames must be aligned
# explicitly. A missing training column raises KeyError here instead of
# producing silently wrong predictions.
test_lgb = encode_labels(test_af)[list(x_lgb_train.columns)]
test_lgb.head()

In [None]:
# Predict with the trained booster on the encoded test features.
y_hat = bst.predict(test_lgb)

In [None]:
# Check how many predictions were produced.
y_hat.shape

In [None]:
# Read the sample submission file (path relative to the working directory);
# it is used below as the template for the output file.
sample_sub = pd.read_csv('sample_submission.csv')
sample_sub.head()

In [None]:
# Overwrite the template's target column with the model predictions.
# Item assignment is used instead of attribute assignment: `df.col = values`
# only writes a column if it already exists, and otherwise sets a plain
# instance attribute that to_csv would never see.
# This relies on `y_hat` being in the same row order as the sample submission.
sample_sub['scalar_coupling_constant'] = y_hat
sample_sub.head()

In [None]:
# Write the filled-in submission to disk without the DataFrame index column.
sample_sub.to_csv('lgb_sub.csv', index=False)

### Create Features w.r.t. Centroid

- [ ] Find angle between atoms w.r.t. centroid
- [ ] Distance to centroid and to each other

### Molecule-related Features

- [x] Frequency of each atom in molecule
- [ ] Size of molecule (x,y,z)
- [ ] Weight of molecule

### Molecule Graph Features

- [ ] Use software to infer molecular bonds.
- [ ] Number and types of bonds between atoms.
- [ ] Can dipole moments, potential energy, and the magnetic shielding tensor be derived from this?