In [None]:
# Enable IPython's autoreload extension in mode 2, so that imported modules are
# reloaded before each cell executes and edits to the helper modules take
# effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import sys

# Make the project's helper modules importable from this notebook.
# The membership check keeps the cell idempotent: re-running it does not
# append a duplicate entry to sys.path.
# NOTE: the path is relative, so it resolves against the kernel's working directory.
if '../modules/' not in sys.path:
    sys.path.append('../modules/')

In [None]:
import pandas as pd
import numpy as np

from preprocess import (load_data, encode_labels, 
                        split_validation, count_atoms, 
                        add_atom_count, add_distance, 
                        center_cos, center_d)
from predict import eval_metric, predict_hgbr
from features import distance

## Create Simple Model

### Load Training Data

In [None]:
# Load the data through the project helper; its three return values are
# unpacked as the training frame, the test frame and `coords`.
# NOTE(review): `coords` presumably holds the atom coordinates from
# structures.csv — confirm against preprocess.load_data.
train, test, coords = load_data()

### Create Train-Validation Split

In [None]:
# Split the training frame into train/validation features (x_*) and targets (y_*).
# How the split is made (ratio, seed, grouping) is decided inside split_validation.
x_train, x_val, y_train, y_val = split_validation(train)

### Load GB Inference Model (GBR, XGB, LGB, HGBR)

Starting with HGBR

In [None]:
# Baseline: hand the training split and the validation features to the HGBR
# helper and keep its predictions for the validation rows.
y_val_hat = predict_hgbr(x_train, y_train, x_val)

# Score the predictions; the validation rows' `type` column is passed along.
eval_metric(y_val, y_val_hat, x_val['type'])

## Feature Creation

### Distance

In [None]:
# Compute the distance feature once and attach it to the training frame as `d`.
# `d` is kept as its own variable because a later cell reuses it.
d = distance(train)
train_d = train.assign(d=d)

# Re-split with the extra column, run the HGBR helper again and score it.
x_d_train, x_d_val, y_train, y_val = split_validation(train_d)
y_d_val_hat = predict_hgbr(x_d_train, y_train, x_d_val)

eval_metric(y_val, y_d_val_hat, x_d_val['type'])

### Center Molecule Coords

- [x] Find center of molecule by averaging atom positions
- [x] Subtract centroid coords from molecule coords

In [None]:
# Reload the data, this time passing a positional flag to the loader.
# This rebinds `train`, `test` and `coords` from the earlier load.
# NOTE(review): `True` presumably switches on centering of the molecule
# coordinates (per this section's heading) — confirm against
# preprocess.load_data and consider passing it by keyword.
train, test, coords = load_data(True)

In [None]:
# Attach the distance values `d` computed in the Distance section to the
# reloaded training frame.
train_dc = train.assign(d=d)

# Split, run the HGBR helper and score on the validation rows.
x_dc_train, x_dc_val, y_train, y_val = split_validation(train_dc)
y_dc_val_hat = predict_hgbr(x_dc_train, y_train, x_dc_val)

eval_metric(y_val, y_dc_val_hat, x_dc_val['type'])

In [None]:
# Build the atom-count table from the coordinates via the project helper.
atom_freqs = count_atoms(coords)
# A bare expression in the middle of a cell is evaluated and then discarded, so
# the preview must be displayed explicitly to appear in the cell output.
display(atom_freqs.head())

# Left-join the counts onto the training rows, keyed by molecule name.
train_af = train_dc.merge(atom_freqs, how='left', on='molecule_name')
train_af.head()

In [None]:
# Split the frame that now carries the atom-count columns, run the HGBR
# helper on it and score the validation predictions.
x_af_train, x_af_val, y_af_train, y_af_val = split_validation(train_af)
y_af_hat = predict_hgbr(x_af_train, y_af_train, x_af_val)

eval_metric(y_af_val, y_af_hat, x_af_val['type'])

### Using LightGBM

In [None]:
import lightgbm as lgb

In [None]:
# Preview the training features before they are label-encoded.
x_af_train.head()

In [None]:
# Run the training features through the project's label encoder and preview them.
# NOTE(review): encode_labels is called here with one frame and used as a single
# return value, while the pipeline cell further down calls it with (train, test)
# and unpacks two results — confirm the helper still supports this form.
x_lgb_train = encode_labels(x_af_train)
x_lgb_train.head()

In [None]:
# Wrap the encoded training features and their targets in a LightGBM Dataset.
train_lgb = lgb.Dataset(x_lgb_train, label=y_af_train)

In [None]:
# LightGBM training parameters: MAE objective with num_leaves set to 63.
param = dict(objective='mae', num_leaves=63)

In [None]:
# Train a booster on the Dataset with the parameters above; no number of
# boosting rounds is passed, so LightGBM's default is used.
bst = lgb.train(param, train_lgb)

In [None]:
# Encode the validation features the same way as the training features, then
# predict the validation targets with the trained booster.
x_lgb_val = encode_labels(x_af_val)
y_lgb_hat = bst.predict(x_lgb_val)

In [None]:
# Score the LightGBM validation predictions, passing the rows' `type` column.
eval_metric(y_af_val, y_lgb_hat, x_lgb_val['type'])

In [None]:
# Preview the test frame before adding features to it.
test.head()

In [None]:
# Left-join the atom-count table onto the test rows, keyed by molecule name,
# mirroring the merge done for the training frame.
test_af = test.merge(atom_freqs, how='left', on='molecule_name')
test_af.head()

In [None]:
# Add the distance feature `d` to the test frame.
# NOTE(review): here `d` is appended after the atom-count columns, whereas the
# training frame received `d` before the atom-count merge, so the column order
# of the two frames differs — verify before predicting by position.
test_af = test_af.assign(d=distance(test_af))
test_af.head()

In [None]:
# Run the test features through the same label encoder used for the training
# features and preview the result.
test_lgb = encode_labels(test_af)
test_lgb.head()

In [None]:
# LightGBM consumes DataFrame columns by position and does not validate feature
# names by default. The test frame gained `d` after the atom-count merge while
# the training frame gained it before, so the columns are selected in training
# order first; a missing column now fails loudly with a KeyError.
y_hat = bst.predict(test_lgb[x_lgb_train.columns])

In [None]:
# Sanity check: show the shape of the test predictions.
y_hat.shape

In [None]:
# Read the sample submission to use as a template for the output file.
# NOTE: the path is relative to the kernel's working directory (the TODO list
# at the end of the notebook tracks fixing the data paths).
sample_sub = pd.read_csv('sample_submission.csv')
sample_sub.head()

In [None]:
# Overwrite the template's target column with the predictions (matched by row
# position). Bracket assignment is used instead of attribute assignment: setting
# an attribute would silently create a plain Python attribute rather than a
# column if the name were ever missing from the frame.
sample_sub['scalar_coupling_constant'] = y_hat
sample_sub.head()

In [None]:
# Write the submission file without the DataFrame index column.
sample_sub.to_csv('lgb_sub.csv', index=False)

### Create Features w.r.t. Centroid

- [x] Find angle between atoms w.r.t. centroid
- [x] Distance to centroid and to each other

### Molecule-related Features

- [x] Frequency of each atom in molecule
- [ ] Size of molecule (x,y,z)
- [ ] Weight of molecule

### Molecule Graph Features

- [ ] Use software to infer molecular bonds.
- [ ] Number and types of bonds between atoms.
- [ ] Can dipole moments, potential energy, and the magnetic shielding tensor be derived from this?

## Make Preprocess Pipeline

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder

Steps for data transformation:

1. [x] Coordinates from structures.csv
2. [x] Encode types into numerical labels.
3. [x] Encode molecule atoms into columns with atom counts.
4. [x] Center molecule coordinates.
5. [x] Add angle w.r.t. centroid feature. (cos of angle is good enough)
6. Add angle w.r.t. nearest atom feature.
7. [x] Add distance feature.
8. Add columns of bond orders between atoms.

In [None]:
# Full preprocessing run, starting from a fresh load so the cell can be re-run
# on its own. Each helper takes the train and test frames together and returns
# the updated pair, which is rebound to the same names.
train, test, coords = load_data()
# Step 2 of the list above: label-encode (presumably the `type` column — confirm).
train, test = encode_labels(train, test)
# Step 3: atom counts built from `coords`.
train, test = add_atom_count(train, test, coords)
# Step 7: distance feature.
train, test = add_distance(train, test)
# Steps 4-5 and the centroid distance from the feature list; the exact columns
# added are defined in preprocess.center_cos / preprocess.center_d.
train, test = center_cos(train, test)
train, test = center_d(train, test)

In [None]:
# Remove the molecule identifier so it is not passed to the model as a feature.
# errors='ignore' makes the cell safe to re-run: without it a second execution
# raises KeyError because the column has already been dropped.
train, test = (train.drop(columns='molecule_name', errors='ignore'),
               test.drop(columns='molecule_name', errors='ignore'))

In [None]:
# Fit LightGBM on the fully preprocessed features and predict the validation rows.
import lightgbm as lgb

x_t, x_v, y_t, y_v = split_validation(train)
train_lgb = lgb.Dataset(x_t, label=y_t)
# MAE objective with num_leaves raised to 200 for this run.
param = dict(objective='mae', num_leaves=200)

h = lgb.train(param, train_lgb)
y_v_hat = h.predict(x_v)

In [None]:
# Score the validation predictions, passing the rows' `type` column.
eval_metric(y_v, y_v_hat, x_v['type'])

In [None]:
# Predict on the fully preprocessed test frame and write the submission file.
y_hat = h.predict(test)
sub = pd.read_csv('sample_submission.csv')
# Replace the column outright instead of writing through `.loc[:, ...]`: the
# in-place form tries to keep the existing column's dtype, and recent pandas
# versions warn about (and will eventually reject) writing float predictions
# into a column that was read as integers.
sub['scalar_coupling_constant'] = y_hat
sub.to_csv('lgb_sub_2.csv', index=False)

## TODO

- [ ] Fix data loading paths.
- [ ] Implement K-Fold cross-validation. 
- [ ] Add features from artgor kernel.
- [ ] Add angle feature from the other kernel
- [ ] Divide the dataset into the different J-coupling types and check the error in each.
- [ ] Optimize hyperparams in LightGBM. Try using XGBoost.
- [ ] Use bond type and charge features.
- [ ] Create more features from the bond type and charge.
- [ ] Try predicting some of the auxiliary data (like Mulliken charges)

In [None]:
# Reload the raw data, discarding the preprocessed `train`/`test` frames bound
# above, as a clean starting point for the TODO items.
train, test, coords = load_data()