In [13]:

import os
import sys
import os
import datetime
import json
import gc
import time
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error
from numba import jit

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, KFold, RepeatedKFold, GroupKFold
import warnings
warnings.filterwarnings("ignore")

sys.path.append('..')
from lib.line_notif import send_message
from lib.utils import current_time, unpickle, to_pickle, reduce_mem_usage

import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split

def group_mean_log_mae(y_true, y_pred, types, floor=1e-9):
    """
    Competition metric: mean over groups of log(per-group MAE).

    The absolute error is averaged within each group of `types`; each group
    MAE is floored at `floor` before the log so a perfect group does not
    produce log(0); the per-group logs are then averaged.

    Metric for https://www.kaggle.com/c/champs-scalar-coupling
    Adapted from https://www.kaggle.com/uberkinder/efficient-metric
    """
    abs_error = (y_true - y_pred).abs()
    per_type_mae = abs_error.groupby(types).mean()
    floored_mae = per_type_mae.clip(lower=floor)
    return np.log(floored_mae).mean()

In [23]:
# Feature columns fed to the model, one name per line of the literal below.
use_cols = """molecule_atom_index_0_dist_min
molecule_atom_index_0_dist_max
molecule_atom_index_1_dist_min
molecule_atom_index_0_dist_mean
molecule_atom_index_0_dist_std
dist
abs_dist
x_0
y_0
z_0
x_1
y_1
z_1
molecule_atom_index_1_dist_std
molecule_atom_index_1_dist_max
molecule_atom_index_1_dist_mean
molecule_atom_index_0_dist_max_diff
molecule_atom_index_0_dist_max_div
molecule_atom_index_0_dist_std_diff
molecule_atom_index_0_dist_std_div
atom_0_couples_count
molecule_atom_index_0_dist_min_div
molecule_atom_index_1_dist_std_diff
molecule_atom_index_0_dist_mean_div
atom_1_couples_count
molecule_atom_index_0_dist_mean_diff
molecule_couples
atom_index_1
molecule_dist_mean
molecule_atom_index_1_dist_max_diff
molecule_atom_index_0_y_1_std
molecule_atom_index_1_dist_mean_diff
molecule_atom_index_1_dist_std_div
molecule_atom_index_1_dist_mean_div
molecule_atom_index_1_dist_min_diff
molecule_atom_index_1_dist_min_div
molecule_atom_index_1_dist_max_div
molecule_atom_index_0_z_1_std
molecule_type_dist_std_diff
molecule_atom_1_dist_min_diff
molecule_atom_index_0_x_1_std
molecule_dist_min
molecule_atom_index_0_dist_min_diff
molecule_atom_index_0_y_1_mean_diff
molecule_type_dist_min
molecule_atom_1_dist_min_div
atom_index_0
molecule_dist_max
molecule_atom_1_dist_std_diff
molecule_type_dist_max
molecule_atom_index_0_y_1_max_diff
molecule_type_0_dist_std_diff
molecule_type_dist_mean_diff
molecule_atom_1_dist_mean
molecule_atom_index_0_y_1_mean_div
molecule_type_dist_mean_div
type
f004:angle
f004:angle_abs
f006:dist_origin_mean
f006:dist_from_origin_0
f006:dist_from_origin_1
Angle
Torsion
cos2T
cosT
dist_xy
dist_xz
dist_yz
C
H
N
O
eem_0
mmff94_0
gasteiger_0
qeq_0
qtpie_0
eem2015ha_0
eem2015hm_0
eem2015hn_0
eem2015ba_0
eem2015bm_0
eem2015bn_0
eem_1
mmff94_1
gasteiger_1
qeq_1
qtpie_1
eem2015ha_1
eem2015hm_1
eem2015hn_1
eem2015ba_1
eem2015bm_1
eem2015bn_1
dist_H_0_x
dist_H_1_x
dist_H_2_x
dist_H_3_x
dist_H_4_x
dist_C_0_x
dist_C_1_x
dist_C_2_x
dist_C_3_x
dist_C_4_x
dist_N_0_x
dist_N_1_x
dist_N_2_x
dist_N_3_x
dist_N_4_x
dist_O_0_x
dist_O_1_x
dist_O_2_x
dist_O_3_x
dist_F_0_x
dist_F_1_x
dist_H_0_y
dist_H_1_y
dist_H_2_y
dist_H_3_y
dist_H_4_y
dist_C_0_y
dist_C_1_y
dist_C_2_y
dist_C_3_y
dist_C_4_y
dist_N_0_y
dist_N_1_y
dist_N_2_y
dist_N_3_y
dist_N_4_y
dist_O_0_y
dist_O_1_y
dist_O_2_y
dist_O_3_y
dist_F_0_y
dist_F_1_y
tda_max_radius
tda_min_radius
a1_degree
a1_hybridization
a1_inring
a1_inring3
a1_inring4
a1_inring5
a1_inring6
a1_inring7
a1_inring8
a1_nb_h
a1_nb_o
a1_nb_c
a1_nb_n
a1_nb_na
a0_nb_degree
a0_nb_hybridization
a0_nb_inring
a0_nb_inring3
a0_nb_inring4
a0_nb_inring5
a0_nb_inring6
a0_nb_inring7
a0_nb_inring8
a0_nb_nb_h
a0_nb_nb_o
a0_nb_nb_c
a0_nb_nb_n
a0_nb_nb_na
x_a0_nb
y_a0_nb
z_a0_nb
a1_nb_degree
a1_nb_hybridization
a1_nb_inring
a1_nb_inring3
a1_nb_inring4
a1_nb_inring5
a1_nb_inring6
a1_nb_inring7
a1_nb_inring8
a1_nb_nb_h
a1_nb_nb_o
a1_nb_nb_c
a1_nb_nb_n
a1_nb_nb_na
x_a1_nb
y_a1_nb
z_a1_nb
dist_to_type_mean""".splitlines()

In [4]:
# Experiment identifiers; they select the column-definition module and all
# input/output directories below.
DATA_VERSION = "v003"
TRIAL_NO = "029"

# Make ../src importable so the per-trial column module can be loaded by name.
sys.path.append("../src")
import importlib
# The module is bound to its own name on purpose: binding it to `use_cols`
# would replace the list of column names that later cells use to index
# `train`/`test`, and a top-to-bottom run would then fail.
use_cols_module = importlib.import_module(f'use_cols_{DATA_VERSION}_{TRIAL_NO}')
print(use_cols_module.good_columns)

# Directories for processed data, models, submissions and logs.
save_path = Path(f"../processed/{DATA_VERSION}")
model_path = Path(f"../model/{DATA_VERSION}_{TRIAL_NO}")
submit_path = Path(f"../submit/{DATA_VERSION}_{TRIAL_NO}")
log_path = Path(f"../log/{DATA_VERSION}_{TRIAL_NO}")
# Create any directory that does not exist yet (no error if it already does).
for _directory in (save_path, model_path, submit_path, log_path):
    _directory.mkdir(parents=True, exist_ok=True)

['molecule_atom_index_0_dist_min', 'molecule_atom_index_0_dist_max', 'molecule_atom_index_1_dist_min', 'molecule_atom_index_0_dist_mean', 'molecule_atom_index_0_dist_std', 'dist', 'abs_dist', 'x_0', 'y_0', 'z_0', 'x_1', 'y_1', 'z_1', 'molecule_atom_index_1_dist_std', 'molecule_atom_index_1_dist_max', 'molecule_atom_index_1_dist_mean', 'molecule_atom_index_0_dist_max_diff', 'molecule_atom_index_0_dist_max_div', 'molecule_atom_index_0_dist_std_diff', 'molecule_atom_index_0_dist_std_div', 'atom_0_couples_count', 'molecule_atom_index_0_dist_min_div', 'molecule_atom_index_1_dist_std_diff', 'molecule_atom_index_0_dist_mean_div', 'atom_1_couples_count', 'molecule_atom_index_0_dist_mean_diff', 'molecule_couples', 'atom_index_1', 'molecule_dist_mean', 'molecule_atom_index_1_dist_max_diff', 'molecule_atom_index_0_y_1_std', 'molecule_atom_index_1_dist_mean_diff', 'molecule_atom_index_1_dist_std_div', 'molecule_atom_index_1_dist_mean_div', 'molecule_atom_index_1_dist_min_diff', 'molecule_atom_inde

In [24]:
# Load the concatenated train/test tables through the project's `unpickle`
# helper. The location is derived from DATA_VERSION / TRIAL_NO instead of a
# hard-coded "v003_029", so it follows the configuration cell.
trial_tag = f"{DATA_VERSION}_{TRIAL_NO}"
train = unpickle(save_path / trial_tag / f"train_concat_{trial_tag}.pkl")
test = unpickle(save_path / trial_tag / f"test_concat_{trial_tag}.pkl")

# Feature matrices restricted to the selected columns (copied so later edits
# do not touch `train`/`test`), plus the regression target.
X = train[use_cols].copy()
y = train['scalar_coupling_constant']
X_test = test[use_cols].copy()

In [25]:
# Sanity check: number of selected feature columns.
len(use_cols)

188

In [26]:
# Compare the full table's shape with the feature matrix's shape.
train.shape, X.shape

((4658147, 233), (4658147, 188))

In [7]:
# Load the saved hold-out model for this data version / trial / seed.
seed = 71
model_file = model_path / f"hold_out_model_{DATA_VERSION}_{TRIAL_NO}_{seed}.pkl"
model = unpickle(model_file)

In [29]:
# Random 90/10 split over all feature columns with a fixed random_state.
# NOTE(review): presumably this reproduces the split used when the hold-out
# model was trained, so X_valid is unseen by the model — confirm.
columns = X.columns
X_train, X_valid, y_train, y_valid = train_test_split(
    X[columns],
    y,
    test_size=0.1,
    random_state=42,
)

In [30]:
# Display the loaded estimator's repr to inspect its hyperparameters.
model

LGBMRegressor(bagging_seed=72, boosting_type='gbdt', class_weight=None,
       colsample_bytree=1.0, feature_fraction_seed=73,
       importance_type='split', learning_rate=0.2, max_depth=9,
       metric='mae', min_child_samples=79, min_child_weight=0.001,
       min_split_gain=0.0, n_estimators=8000, n_jobs=-1, num_leaves=128,
       num_threads=-1, objective='regression', random_state=None,
       reg_alpha=0.1, reg_lambda=0.3, seed=71, silent=True, subsample=0.9,
       subsample_for_bin=200000, subsample_freq=1, verbosity=-1)

In [34]:
# Preview the first rows of the training split.
X_train.head()

Unnamed: 0,molecule_atom_index_0_dist_min,molecule_atom_index_0_dist_max,molecule_atom_index_1_dist_min,molecule_atom_index_0_dist_mean,molecule_atom_index_0_dist_std,dist,abs_dist,x_0,y_0,z_0,...,a1_nb_inring8,a1_nb_nb_h,a1_nb_nb_o,a1_nb_nb_c,a1_nb_nb_n,a1_nb_nb_na,x_a1_nb,y_a1_nb,z_a1_nb,dist_to_type_mean
3930804,1.095932,3.61736,1.093289,2.472062,0.94771,1.095932,1.718096,-1.485696,1.792534,0.339412,...,0,0,0,1,0,0,-1.485696,1.792534,0.339412,1.002775
258790,1.105069,3.072265,1.765062,2.43799,0.563292,2.503735,3.664633,0.576237,0.100403,-2.064712,...,0,3,0,1,0,0,2.135812,-0.695696,-0.800948,0.926581
1533600,1.095765,3.314278,1.083687,2.42128,0.767309,3.157974,4.865069,1.722459,4.999507,1.215382,...,0,0,0,1,0,0,1.339926,2.703822,-2.219744,1.025633
3222881,1.094762,2.870522,1.10426,2.152555,0.761025,2.171913,3.065948,-0.353353,2.066271,0.767856,...,0,0,0,1,0,0,-1.104999,-0.190652,0.091709,0.991561
3064518,1.087312,3.573952,2.133517,2.489014,0.836157,2.133517,2.843518,3.883479,0.313637,1.55684,...,0,1,0,1,0,0,4.963581,1.165528,-1.389071,0.974032


In [40]:
# `tqdm_notebook` is a deprecated alias; `tqdm.auto` selects the notebook
# widget when running under Jupyter and the console bar otherwise.
from tqdm.auto import tqdm

In [46]:
# Baseline: metric on the untouched validation split, grouped by `type`.
y_pred_valid = model.predict(X_valid)
score = group_mean_log_mae(
    y_true=y_valid,
    y_pred=y_pred_valid,
    types=X_valid['type'],
)
print(f"score: {score}")

score: -1.1120130623643119


In [42]:
# Permutation importance on the validation split: permute one feature column
# at a time, re-predict, and record the metric. The metric is a mean log-MAE
# (lower is better), so a larger score after permuting a column means the
# model depended more on that column.
np.random.seed(71)
result_list = []
# One working copy for the whole loop; each column is permuted and then
# restored, instead of copying the full validation frame for every column.
X_valid_ = X_valid.copy()
for c in tqdm(X_valid.columns):
    if c == "type":
        # `type` is the grouping key of the metric, so it is left intact.
        continue
    # Assign a permuted copy of the column rather than shuffling `.values`
    # in place: the in-place form only has an effect when `.values` is a
    # writable view of the frame, which pandas does not guarantee
    # (e.g. under Copy-on-Write).
    shuffled_rows = np.random.permutation(len(X_valid_))
    X_valid_[c] = X_valid[c].values[shuffled_rows]
    y_pred_valid = model.predict(X_valid_)
    score = group_mean_log_mae(y_valid, y_pred_valid, X_valid_['type'])
    print(f"col:{c}, score: {score}")
    result_list.append({"col": c, "score": score})
    # Put the original values back so the next column is evaluated alone.
    X_valid_[c] = X_valid[c].values

HBox(children=(IntProgress(value=0, max=188), HTML(value='')))

col:molecule_atom_index_0_dist_min, score: -0.8016989676908528
col:molecule_atom_index_0_dist_max, score: -0.835570230386695
col:molecule_atom_index_1_dist_min, score: -0.8568424887135307
col:molecule_atom_index_0_dist_mean, score: -1.0151867421155965
col:molecule_atom_index_0_dist_std, score: -1.0647992259171388
col:dist, score: 0.5974748596606333
col:abs_dist, score: -1.048671312384502
col:x_0, score: -1.100368103716089
col:y_0, score: -1.0827545244819905
col:z_0, score: -1.1016572002387568
col:x_1, score: -1.0942721757895792
col:y_1, score: -1.0637109338257633
col:molecule_atom_index_0_dist_max_diff, score: -0.7983006755317116
col:molecule_atom_index_0_dist_max_div, score: -0.7245568108524868
col:molecule_atom_index_0_dist_std_diff, score: -0.8999561148573347
col:molecule_atom_index_0_dist_std_div, score: -0.7957186021297646
col:atom_0_couples_count, score: -1.0617480907255836
col:molecule_atom_index_0_dist_min_div, score: -0.620996527273829
col:molecule_atom_index_1_dist_std_diff, 

col:y_a1_nb, score: -1.07390355946317
col:z_a1_nb, score: -1.0972085090788661
col:dist_to_type_mean, score: -0.008194903762386407


In [43]:
# One row per permuted column, with columns `col` and `score`.
result_df = pd.DataFrame(result_list)

In [44]:
# Display the permutation results table.
result_df

Unnamed: 0,col,score
0,molecule_atom_index_0_dist_min,-0.801699
1,molecule_atom_index_0_dist_max,-0.835570
2,molecule_atom_index_1_dist_min,-0.856842
3,molecule_atom_index_0_dist_mean,-1.015187
4,molecule_atom_index_0_dist_std,-1.064799
5,dist,0.597475
6,abs_dist,-1.048671
7,x_0,-1.100368
8,y_0,-1.082755
9,z_0,-1.101657


In [45]:
# Write the results next to the notebook. `to_csv` is called with its
# defaults, so the row index is written as an unnamed first column.
result_df.to_csv("./permutation_importance.csv")

In [None]:
# NOTE(review): looks like a leftover scratch cell with no role in the
# analysis — consider deleting it.
print(1)