In [None]:
import pandas as pd
import numpy as np

## Create Simple Model

### Load Training Data

- [x] Read train and structures csv files.
- [x] Merge each atom index with xyz coords + element

In [None]:
# Load the atom-pair tables; both use their `id` column as the index.
train = pd.read_csv('train.csv', index_col='id')
test = pd.read_csv('test.csv', index_col='id')
# The structures table keeps the default integer index; add_positions joins on its columns.
structs = pd.read_csv('structures.csv')

In [None]:
def add_positions(pairs, structs):
    """Attach the structure columns of both atoms to every atom pair.

    For each of the two atoms a pair refers to (``atom_index_0`` and
    ``atom_index_1``), the matching row of ``structs`` is left-joined onto the
    pair and its columns are suffixed with ``_0`` / ``_1`` respectively.

    Parameters
    ----------
    pairs : pandas.DataFrame
        Must contain ``molecule_name``, ``atom_index_0`` and ``atom_index_1``.
    structs : pandas.DataFrame
        Must contain ``molecule_name`` and ``atom_index``; every other column
        is copied onto the pair twice, once per atom.

    Returns
    -------
    pandas.DataFrame
        A new frame (the inputs are not modified) holding the remaining
        ``pairs`` columns followed by the ``*_0`` and then the ``*_1``
        structure columns. ``atom_index_0`` / ``atom_index_1`` are removed.
        Pairs with no matching structure row get NaN in the added columns.
        The result carries a fresh integer index: merging on columns does not
        keep the index of ``pairs``.
    """
    key = 'molecule_name'
    for suffix in ('_0', '_1'):
        # Suffix the structure columns *before* merging. Merging the raw
        # `structs` twice and relying on suffixes=('_0', '_1') renames its
        # `atom_index` column to `atom_index_0`, which collides with the pair
        # column of the same name (a MergeError on current pandas).
        atom_cols = structs.rename(
            columns={col: col + suffix for col in structs.columns if col != key})
        pairs = pd.merge(pairs, atom_cols, how='left',
                         on=[key, 'atom_index' + suffix])

    # The atom indices were only needed as join keys.
    return pairs.drop(columns=['atom_index_0', 'atom_index_1'])

In [None]:
# Attach the structure columns of both atoms to every training pair.
# NOTE: add_positions removes atom_index_0/atom_index_1, which it also needs as join
# keys, so this cell cannot be re-run without first re-loading `train`.
train = add_positions(train, structs)
train.head()

In [None]:
# Attach the structure columns of both atoms to every test pair.
# NOTE: add_positions removes atom_index_0/atom_index_1, which it also needs as join
# keys, so this cell cannot be re-run without first re-loading `test`.
# NOTE(review): the merge returns a fresh integer index, so the `id` index set when
# loading `test` is not carried over — confirm nothing downstream needs those ids.
test = add_positions(test, structs)
test.head()

### Create Train-Validation Split

- [x] Split data into training set and validation set.
- [x] Remove target variable from data.
- [x] Implement evaluation metric.
- [ ] Use sklearn RandomForestRegressor for fitting.
- [ ] Run on validation data and test data.

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Target vector: an independent copy of the coupling-constant column.
y = train['scalar_coupling_constant'].copy()
y.head()

In [None]:
# Feature frame: everything in `train` except the target column (`train` is left unchanged).
x = train.drop(columns=['scalar_coupling_constant'])
x.head()

In [None]:
# Build the feature frame straight from `train` rather than from the previous `x`,
# so re-running this cell does not fail on an already-removed 'molecule_name' column.
x = train.drop(columns=['scalar_coupling_constant', 'molecule_name'])
x.head()

### Sanitize Features

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Integer-encode the coupling `type` labels.
le = LabelEncoder()
# Assign with x[col] = ... rather than x.loc[:, col] = ...: the column is replaced and
# takes the encoder's integer dtype, whereas on pandas >= 2.0 the .loc form writes the
# integers into the existing object-dtype column in place.
x['type'] = le.fit_transform(x['type'])
x.head()

In [None]:
# Integer-encode the element of the first atom with its own encoder (instead of
# refitting the shared `le`), so this column's label -> integer mapping is kept.
le_atom_0 = LabelEncoder()
# x[col] = ... replaces the column so it takes the integer dtype; on pandas >= 2.0
# x.loc[:, col] = ... would write in place and leave the column as object dtype.
x['atom_0'] = le_atom_0.fit_transform(x['atom_0'])
x.head()

In [None]:
# Integer-encode the element of the second atom with its own encoder (instead of
# refitting the shared `le`), so this column's label -> integer mapping is kept.
le_atom_1 = LabelEncoder()
# x[col] = ... replaces the column so it takes the integer dtype; on pandas >= 2.0
# x.loc[:, col] = ... would write in place and leave the column as object dtype.
x['atom_1'] = le_atom_1.fit_transform(x['atom_1'])
x.head()

### Split

In [None]:
# Hold out 33% of the rows for validation; the fixed random_state keeps the split repeatable.
x_train, x_val, y_train, y_val = train_test_split(
    x, y, test_size=0.33, random_state=0
)

In [None]:
# Preview the first rows of the training features.
x_train.head()

In [None]:
# Preview the first rows of the training targets.
y_train.head()

### Evaluation Metric

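Metric taken from https://www.kaggle.com/uberkinder/efficient-metric: the log of the mean absolute error, computed per coupling type and then averaged across types.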

In [None]:
def eval_metric(y, y_hat, j_type, floor=1e-9):
    """Mean over groups of the log of the per-group mean absolute error.

    Parameters
    ----------
    y : pandas.Series
        True values.
    y_hat : array-like
        Predictions, subtracted from ``y``.
    j_type : array-like or pandas.Series
        Group key for each row; the absolute errors are grouped by it.
    floor : float, default 1e-9
        Lower bound applied to each group's MAE before taking the logarithm,
        so a group with zero error does not produce ``-inf``.

    Returns
    -------
    float
        Average of ``log(max(MAE_group, floor))`` over all groups.
    """
    abs_err = (y - y_hat).abs()
    mae_by_group = abs_err.groupby(j_type).mean()
    floored = mae_by_group.clip(lower=floor)
    return np.log(floored).mean()

### Load GB Inference Model (GBR, XGB, LGB, HGBR)

Starting with HGBR (scikit-learn's `HistGradientBoostingRegressor`).

In [None]:
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor

In [None]:
# Seed the estimator so repeated runs give the same model; the same seed value (0)
# is used for the train/validation split elsewhere in this notebook.
hgbr = HistGradientBoostingRegressor(random_state=0)

In [None]:
# Fit the baseline model on the training split.
hgbr.fit(x_train, y_train)

In [None]:
# Baseline predictions for the held-out validation rows.
y_val_hat = hgbr.predict(x_val)

In [None]:
# Baseline validation score, with errors grouped by the `type` column of the validation features.
eval_metric(y_val, y_val_hat, x_val['type'])

## Feature Creation

### Distance

In [None]:
# Euclidean distance between the two atoms of each pair: square root of the summed
# squared coordinate differences along x, y and z.
d = np.sqrt(sum((x[axis + '_1'] - x[axis + '_0']) ** 2 for axis in 'xyz'))
d.head()

In [None]:
# New feature frame: `x` plus the distance as column `d` (`x` itself is left unchanged).
x_d = x.assign(d=d)
x_d.head()

In [None]:
# Split the distance-augmented features with the same test_size and random_state
# as the baseline split; y_train / y_val are reassigned from this split.
x_d_train, x_d_val, y_train, y_val = train_test_split(
    x_d, y, test_size=0.33, random_state=0
)

In [None]:
# Fit the same `hgbr` estimator object on the features that include `d`.
# NOTE(review): this presumably replaces the baseline fit held by `hgbr` — confirm the
# baseline model is not needed after this point.
hgbr.fit(x_d_train, y_train)

In [None]:
# Validation predictions from the model fitted with the distance feature.
y_d_val_hat = hgbr.predict(x_d_val)

In [None]:
# Validation score with the distance feature, with errors grouped by the `type` column.
eval_metric(y_val, y_d_val_hat, x_d_val['type'])

### Center Molecule Coords

- [ ] Find center of molecule by averaging atom positions
- [ ] Subtract centroid coords from molecule coords

In [None]:
# Small independent sample of the structures table for prototyping: the rows labelled
# 0 through 25 (.loc slices by label and includes the end label).
# NOTE(review): the sample is cut by row label, so the last molecule in it may be
# incomplete — confirm that is acceptable for the centroid check below.
s25 = structs.loc[:25].copy()

In [None]:
# Per-molecule centroid: the mean x / y / z over the atoms of each molecule.
# groupby(...).mean() replaces the hand-rolled sum() / count() and groups only once.
mol_mu_25 = s25.groupby('molecule_name')[['x', 'y', 'z']].mean()
mol_mu_25

### Create Features w.r.t. Centroid

- [ ] Find angle between atoms w.r.t. centroid
- [ ] Distance to centroid and to each other

### Molecule-related Features

- [ ] Frequency of each atom in molecule
- [ ] Size of molecule (x,y,z)
- [ ] Weight of molecule

### Molecule Graph Features

- [ ] Use software to infer molecular bonds.
- [ ] Number and types of bonds between atoms.
- [ ] Can find dipole moments, potential energy, magnetic shielding tensor from this?