In [1]:
# 基本ライブラリ
import pandas as pd
import pandas.io.sql as psql
import numpy as np
import numpy.random as rd
import gc
import multiprocessing as mp
import os
import sys
import pickle
from collections import defaultdict
from glob import glob
import math
from datetime import datetime as dt
from pathlib import Path
import scipy.stats as st
import re
import shutil
from tqdm import tqdm_notebook as tqdm
import datetime
ts_conv = np.vectorize(datetime.datetime.fromtimestamp) # Unix time in seconds (10 digits) => datetime, element-wise

# グラフ描画系
import matplotlib
from matplotlib import font_manager
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from matplotlib import rc

from matplotlib import animation as ani
from IPython.display import Image

# Force an edge colour on patch artists in matplotlib figures.
plt.rcParams["patch.force_edgecolor"] = True
#rc('text', usetex=True)
from IPython.display import display # Allows the use of display() for DataFrames
import seaborn as sns
# Seaborn theme: "whitegrid" style, "muted" palette, then dashed grid lines on top of it.
sns.set(style="whitegrid", palette="muted", color_codes=True)
sns.set_style("whitegrid", {'grid.linestyle': '--'})
# Named xkcd colours kept as module-level shortcuts.
red = sns.xkcd_rgb["light red"]
green = sns.xkcd_rgb["medium green"]
blue = sns.xkcd_rgb["denim blue"]

# Maximum number of characters displayed per column (the pandas default is 50).
pd.set_option("display.max_colwidth", 100)

# Number of rows / columns displayed: None removes the limit.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
# Display floats with thousands separators and 5 decimal places.
pd.options.display.float_format = '{:,.5f}'.format

%matplotlib inline
%config InlineBackend.figure_format='retina'

In [2]:
# Identifiers of the run to inspect; they are interpolated into the
# ../log/{DATA_VERSION}_{TRIAL_NO}/ directory and importance file names below.
DATA_VERSION = "v003"
TRIAL_NO = "087"
# Appears as the last component of each importance CSV file name.
seed = 2069

In [3]:
# Load the feature-importance CSV of each of the 8 runs (file index 0-7) and
# reshape it into a feature x fold table with two summary columns:
#   "ave"   - mean importance over the fold columns
#   "ratio" - "ave" divided by the sum of "ave" over all features
# Tables are appended to importance_list in file-index order. A file that
# cannot be loaded or processed is reported and skipped, so importance_list
# may end up with fewer than 8 entries.
importance_list = []
for type_idx in range(8):
    path = f"../log/{DATA_VERSION}_{TRIAL_NO}/importance_{DATA_VERSION}_{TRIAL_NO}_{type_idx}_{seed}.csv"
    try:
        raw_importance = pd.read_csv(path, index_col=0)
        # One "fold_<k>" column per fold, indexed by feature name.
        importance_df = pd.concat(
            [
                fold_df.set_index("feature")[["importance"]].rename({"importance": f"fold_{fold}"}, axis=1)
                for fold, fold_df in raw_importance.groupby("fold")
            ],
            axis=1,
        )
        importance_df["ave"] = importance_df.mean(axis=1)
        importance_df["ratio"] = importance_df["ave"] / importance_df["ave"].sum()
        importance_df = importance_df.sort_values("ratio", ascending=False)
        importance_list.append(importance_df)
    except Exception as e:
        # Best effort: keep going, but say which file was skipped and why.
        print(f"skipped {path}: {e}")

In [4]:
# `importance_df` here is whatever the loading loop above assigned last;
# reset_index() turns the feature index into a regular column for display.
importance_df.reset_index()

Unnamed: 0,feature,fold_1,fold_2,fold_3,fold_4,fold_5,ave,ratio
0,oof_fc,543539.03401,543148.50255,541739.80372,544455.6718,542450.74944,543066.7523,0.94043
1,acsf_31_0,1975.82924,1893.73639,1910.27297,1928.7659,1944.48796,1930.61849,0.00334
2,cosT,1103.52196,1200.64721,1147.27301,1111.41471,1116.69911,1135.9112,0.00197
3,3Jd_idx1_2nd,1052.8714,1114.8467,1035.53462,1031.88562,1061.97539,1059.42275,0.00183
4,cos2T,834.59175,1015.56164,849.80865,801.77211,854.86843,871.32052,0.00151
5,dist_C_0_y,709.80835,751.26265,893.93617,805.17888,763.47266,784.73174,0.00136
6,3Jlast_AverageBondAngle,746.56216,714.78019,621.27231,515.08798,598.97341,639.33521,0.00111
7,1J_ex1_dist_from_first_min,654.73915,588.20841,610.52095,713.24742,622.6936,637.88191,0.0011
8,interBond_Length,400.1904,433.43219,462.49309,457.40383,494.3274,449.56938,0.00078
9,3J3rd_AverageBondAngle,396.28018,440.7004,415.10102,410.06506,385.594,409.54813,0.00071


In [5]:
# Put the "ratio" column of every loaded importance table side by side
# (one column per entry of importance_list, rows = union of feature names).
# sort=True makes the row alignment explicit instead of relying on the
# deprecated implicit sort of a non-aligned index.
importance_all_df = pd.concat(
    [table["ratio"] for table in importance_list],
    axis=1,
    sort=True,
)

# Label columns by their position in importance_list. The number of labels
# follows the number of loaded tables rather than a fixed 8, so a skipped
# file no longer causes a length-mismatch error here.
# NOTE(review): labels are positional - if a file was skipped while building
# importance_list, "type_<k>" is not the file index; confirm before comparing.
importance_all_df.columns = [f"type_{k}" for k in range(importance_all_df.shape[1])]
importance_all_df["ave"] = importance_all_df.mean(axis=1)
importance_all_df = importance_all_df.sort_values("ave", ascending=False)
importance_all_df.to_excel(f"../log/{DATA_VERSION}_{TRIAL_NO}/importance_all.xlsx")
importance_all_df

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


Unnamed: 0,type_0,type_1,type_2,type_3,type_4,type_5,type_6,type_7,ave
oof_fc,0.77495,0.99672,0.86109,0.99398,0.98675,0.97044,0.98838,0.94043,0.93909
1J1st_AveSmallestBondAngle_diff,0.21983,2e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.02748
1J_ex1_SmallestBondAngle_max,1e-05,2e-05,0.11851,9e-05,7e-05,0.00025,3e-05,0.00029,0.01491
cosT,0.0,0.0,0.0,0.0,0.0,0.00047,0.00255,0.00197,0.00062
2Jd_idx1_2nd,0.0,0.0,0.00398,4e-05,0.00041,0.0,0.0,0.0,0.00055
cos2T,0.0,0.0,0.0,0.0,0.0,0.00235,0.00015,0.00151,0.0005
mmff94_1,2e-05,1e-05,8e-05,0.0,8e-05,0.00306,0.0,0.00037,0.00045
acsf_31_0,1e-05,1e-05,4e-05,2e-05,4e-05,9e-05,2e-05,0.00334,0.00045
2J2nd_AverageBondAngle,0.0,0.0,0.0002,9e-05,0.00263,0.0,0.0,0.0,0.00037
1J_ex1_dist_0_min,1e-05,1e-05,0.00251,2e-05,3e-05,0.00016,2e-05,0.00018,0.00037


In [18]:
# Feature names of the last 11 rows of importance_all_df.
importance_all_df.tail(11).index

Index(['a1_hybridization', 'a1_nb_inring7', 'a0_nb_nb_h', 'a1_nb_n', 'a1_nb_c',
       'a1_nb_inring8', 'a1_nb_nb_na', 'a1_nb_h', 'a1_nb_na', 'type',
       'a0_nb_nb_na'],
      dtype='object')

In [41]:
# Column names grouped by prefix; order is significant and kept as-is.
col = [
    # a1_* columns
    'a1_degree', 'a1_hybridization',
    'a1_inring', 'a1_inring3', 'a1_inring4', 'a1_inring5',
    'a1_inring6', 'a1_inring7', 'a1_inring8',
    'a1_nb_h', 'a1_nb_o', 'a1_nb_c', 'a1_nb_n', 'a1_nb_na',
    # a0_nb_* columns
    'a0_nb_degree', 'a0_nb_hybridization',
    'a0_nb_inring', 'a0_nb_inring3', 'a0_nb_inring4', 'a0_nb_inring5',
    'a0_nb_inring6', 'a0_nb_inring7', 'a0_nb_inring8',
    'a0_nb_nb_h', 'a0_nb_nb_o', 'a0_nb_nb_c', 'a0_nb_nb_n', 'a0_nb_nb_na',
    'x_a0_nb', 'y_a0_nb', 'z_a0_nb',
    # a1_nb_* columns
    'a1_nb_degree', 'a1_nb_hybridization',
    'a1_nb_inring', 'a1_nb_inring3', 'a1_nb_inring4', 'a1_nb_inring5',
    'a1_nb_inring6', 'a1_nb_inring7', 'a1_nb_inring8',
    'a1_nb_nb_h', 'a1_nb_nb_o', 'a1_nb_nb_c', 'a1_nb_nb_n', 'a1_nb_nb_na',
    'x_a1_nb', 'y_a1_nb', 'z_a1_nb',
    'dist_to_type_mean',
]

In [43]:
# Print every column name on its own line.
for column_name in col:
    print(column_name)

a1_degree
a1_hybridization
a1_inring
a1_inring3
a1_inring4
a1_inring5
a1_inring6
a1_inring7
a1_inring8
a1_nb_h
a1_nb_o
a1_nb_c
a1_nb_n
a1_nb_na
a0_nb_degree
a0_nb_hybridization
a0_nb_inring
a0_nb_inring3
a0_nb_inring4
a0_nb_inring5
a0_nb_inring6
a0_nb_inring7
a0_nb_inring8
a0_nb_nb_h
a0_nb_nb_o
a0_nb_nb_c
a0_nb_nb_n
a0_nb_nb_na
x_a0_nb
y_a0_nb
z_a0_nb
a1_nb_degree
a1_nb_hybridization
a1_nb_inring
a1_nb_inring3
a1_nb_inring4
a1_nb_inring5
a1_nb_inring6
a1_nb_inring7
a1_nb_inring8
a1_nb_nb_h
a1_nb_nb_o
a1_nb_nb_c
a1_nb_nb_n
a1_nb_nb_na
x_a1_nb
y_a1_nb
z_a1_nb
dist_to_type_mean


In [44]:
# Names of the columns to exclude, written one per line and split into a list.
del_cols = """a0_nb_nb_c
a0_nb_inring5
a1_inring4
a1_inring5
a0_nb_nb_n
a1_nb_nb_c
a0_nb_inring4
a1_nb_nb_h
a1_nb_n
a0_nb_inring6
a1_inring6
a0_nb_nb_h
a1_nb_c
a1_inring3
a0_nb_nb_o
a1_nb_nb_n
a1_nb_inring5
a1_nb_nb_o
a1_nb_o
a1_nb_inring4
a1_inring7
a1_nb_inring3
a0_nb_inring
a1_nb_degree
a1_nb_h
a0_nb_inring7
a1_nb_inring6
a1_inring
F
a0_nb_inring3
a1_degree
a1_hybridization
a1_inring8
a0_nb_hybridization
a1_nb_hybridization
a1_nb_inring
a1_nb_inring7
a0_nb_inring8
a0_nb_degree
a1_nb_inring8
sp
a1_nb_na
a1_nb_nb_na
type
a0_nb_nb_na""".splitlines()

In [45]:
# Display the parsed list to check that the multi-line string split as intended.
del_cols

['a0_nb_nb_c',
 'a0_nb_inring5',
 'a1_inring4',
 'a1_inring5',
 'a0_nb_nb_n',
 'a1_nb_nb_c',
 'a0_nb_inring4',
 'a1_nb_nb_h',
 'a1_nb_n',
 'a0_nb_inring6',
 'a1_inring6',
 'a0_nb_nb_h',
 'a1_nb_c',
 'a1_inring3',
 'a0_nb_nb_o',
 'a1_nb_nb_n',
 'a1_nb_inring5',
 'a1_nb_nb_o',
 'a1_nb_o',
 'a1_nb_inring4',
 'a1_inring7',
 'a1_nb_inring3',
 'a0_nb_inring',
 'a1_nb_degree',
 'a1_nb_h',
 'a0_nb_inring7',
 'a1_nb_inring6',
 'a1_inring',
 'F',
 'a0_nb_inring3',
 'a1_degree',
 'a1_hybridization',
 'a1_inring8',
 'a0_nb_hybridization',
 'a1_nb_hybridization',
 'a1_nb_inring',
 'a1_nb_inring7',
 'a0_nb_inring8',
 'a0_nb_degree',
 'a1_nb_inring8',
 'sp',
 'a1_nb_na',
 'a1_nb_nb_na',
 'type',
 'a0_nb_nb_na']

In [48]:
# Column list stored under the v003_009 log directory (a fixed path, not built
# from DATA_VERSION / TRIAL_NO); the first CSV column is used as the index.
use_cols = pd.read_csv("../log/v003_009/use_cols.csv", index_col=0)

In [52]:
# Print the entries of use_cols["columns"] that are not listed in del_cols.
kept_use_cols = [name for name in use_cols["columns"] if name not in del_cols]
for kept_name in kept_use_cols:
    print(kept_name)

molecule_atom_index_0_dist_min
molecule_atom_index_0_dist_max
molecule_atom_index_1_dist_min
molecule_atom_index_0_dist_mean
molecule_atom_index_0_dist_std
dist
abs_dist
x_0
y_0
z_0
x_1
y_1
z_1
molecule_atom_index_1_dist_std
molecule_atom_index_1_dist_max
molecule_atom_index_1_dist_mean
molecule_atom_index_0_dist_max_diff
molecule_atom_index_0_dist_max_div
molecule_atom_index_0_dist_std_diff
molecule_atom_index_0_dist_std_div
atom_0_couples_count
molecule_atom_index_0_dist_min_div
molecule_atom_index_1_dist_std_diff
molecule_atom_index_0_dist_mean_div
atom_1_couples_count
molecule_atom_index_0_dist_mean_diff
molecule_couples
atom_index_1
molecule_dist_mean
molecule_atom_index_1_dist_max_diff
molecule_atom_index_0_y_1_std
molecule_atom_index_1_dist_mean_diff
molecule_atom_index_1_dist_std_div
molecule_atom_index_1_dist_mean_div
molecule_atom_index_1_dist_min_diff
molecule_atom_index_1_dist_min_div
molecule_atom_index_1_dist_max_div
molecule_atom_index_0_z_1_std
molecule_type_dist_std_di

In [53]:
# 'id' followed by the a1_*, a0_nb_* and a1_nb_* column names and
# 'dist_to_type_mean'; order is significant and kept as-is.
rdkit_cols = [
    'id',
    # a1_* columns
    'a1_degree', 'a1_hybridization',
    'a1_inring', 'a1_inring3', 'a1_inring4', 'a1_inring5',
    'a1_inring6', 'a1_inring7', 'a1_inring8',
    'a1_nb_h', 'a1_nb_o', 'a1_nb_c', 'a1_nb_n', 'a1_nb_na',
    # a0_nb_* columns
    'a0_nb_degree', 'a0_nb_hybridization',
    'a0_nb_inring', 'a0_nb_inring3', 'a0_nb_inring4', 'a0_nb_inring5',
    'a0_nb_inring6', 'a0_nb_inring7', 'a0_nb_inring8',
    'a0_nb_nb_h', 'a0_nb_nb_o', 'a0_nb_nb_c', 'a0_nb_nb_n', 'a0_nb_nb_na',
    'x_a0_nb', 'y_a0_nb', 'z_a0_nb',
    # a1_nb_* columns
    'a1_nb_degree', 'a1_nb_hybridization',
    'a1_nb_inring', 'a1_nb_inring3', 'a1_nb_inring4', 'a1_nb_inring5',
    'a1_nb_inring6', 'a1_nb_inring7', 'a1_nb_inring8',
    'a1_nb_nb_h', 'a1_nb_nb_o', 'a1_nb_nb_c', 'a1_nb_nb_n', 'a1_nb_nb_na',
    'x_a1_nb', 'y_a1_nb', 'z_a1_nb',
    'dist_to_type_mean',
]

In [54]:
# Entries of rdkit_cols that are not listed in del_cols, in their original order.
[
    name
    for name in rdkit_cols
    if name not in del_cols
]

['id',
 'x_a0_nb',
 'y_a0_nb',
 'z_a0_nb',
 'x_a1_nb',
 'y_a1_nb',
 'z_a1_nb',
 'dist_to_type_mean']

In [8]:
# Natural logarithm of 0.114891.
# NOTE(review): the origin of this value is not shown in the notebook - presumably a score being put on a log scale; confirm.
np.log(0.114891)

-2.1637714261770253

In [9]:
# Natural logarithm of 0.113502.
# NOTE(review): the origin of this value is not shown in the notebook - presumably a score being put on a log scale; confirm.
np.log( 0.113502)

-2.175934821070556

In [10]:

# Read the scalar coupling contributions table from the input directory.
scalar_coupling_contributions = pd.read_csv(
    '../input/scalar_coupling_contributions.csv'
)

In [11]:
# Preview the first five rows of the loaded table.
scalar_coupling_contributions.head()

Unnamed: 0,molecule_name,atom_index_0,atom_index_1,type,fc,sd,pso,dso
0,dsgdb9nsd_000001,1,0,1JHC,83.0224,0.25458,1.25862,0.27201
1,dsgdb9nsd_000001,1,2,2JHH,-11.0347,0.35298,2.85839,-3.4336
2,dsgdb9nsd_000001,1,3,2JHH,-11.0325,0.35294,2.85852,-3.43387
3,dsgdb9nsd_000001,1,4,2JHH,-11.0319,0.35293,2.85855,-3.43393
4,dsgdb9nsd_000001,2,0,1JHC,83.0222,0.25459,1.25861,0.27201
