In [1]:
# 基本ライブラリ
import pandas as pd
import pandas.io.sql as psql
import numpy as np
import numpy.random as rd
import gc
import multiprocessing as mp
import os
import sys
import pickle
from collections import defaultdict
from glob import glob
import math
from datetime import datetime as dt
from pathlib import Path
import scipy.stats as st
import re
import shutil
from tqdm import tqdm_notebook as tqdm
import datetime
ts_conv = np.vectorize(datetime.datetime.fromtimestamp) # 秒ut(10桁) ⇒ 日付

# グラフ描画系
import matplotlib
from matplotlib import font_manager
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from matplotlib import rc

from matplotlib import animation as ani
from IPython.display import Image

plt.rcParams["patch.force_edgecolor"] = True
#rc('text', usetex=True)
from IPython.display import display # Allows the use of display() for DataFrames
import seaborn as sns
sns.set(style="whitegrid", palette="muted", color_codes=True)
sns.set_style("whitegrid", {'grid.linestyle': '--'})
red = sns.xkcd_rgb["light red"]
green = sns.xkcd_rgb["medium green"]
blue = sns.xkcd_rgb["denim blue"]

#カラム内の文字数。デフォルトは50
pd.set_option("display.max_colwidth", 100)

#行数
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
#
pd.options.display.float_format = '{:,.5f}'.format

%matplotlib inline
%config InlineBackend.figure_format='retina'

In [2]:
DATA_VERSION = "v003"
TRIAL_NO = "093"
seed = 2069

In [3]:
path = f"../log/{DATA_VERSION}_{TRIAL_NO}/importance_fc_{DATA_VERSION}_{TRIAL_NO}_{seed}.csv"
importance_df = pd.read_csv(path, index_col=0)
importance_df = importance_df.pivot_table(values="importance", columns="fold", index="feature")
importance_df["ave"] = importance_df.mean(axis=1)
importance_df.sort_values("ave", inplace=True, ascending=False)
importance_df = importance_df.reset_index()

In [4]:
importance_df["ratio"] = importance_df["ave"] / importance_df["ave"].sum()

In [5]:
importance_df.to_excel(f"../log/{DATA_VERSION}_{TRIAL_NO}/importance_fc_all.xlsx")

In [6]:
importance_df["max"] = importance_df.iloc[:, :5].max(axis=1)

In [7]:
importance_df.sort_values("max", ascending=False)

fold,feature,1,2,3,4,5,ave,ratio,max
0,1J1st_AverageBondAngle,10127935611.26363,10128546478.46309,10092101273.27353,10117353282.61989,10064038541.19566,10105995037.36316,0.9159,10128546478.46309
1,dist_C_0_x,279505476.87502,279029829.68422,279139796.09266,279432238.4238,278922846.01418,279206037.41798,0.0253,279505476.87502
2,eem2015ha_1,98970265.10322,97918683.60356,100903396.79585,64045004.65914,100969724.32483,92561414.89732,0.00839,100903396.79585
4,molecule_type_dist_std_diff,87798140.13422,62904457.22691,62480740.71265,66616321.19971,6407710.04247,57241473.86319,0.00519,87798140.13422
3,inv_distPR,72364948.79101,62345161.20018,78553709.12838,74291936.2304,74434819.42264,72398114.95452,0.00656,78553709.12838
5,2Jd_idx1_2nd,24460900.81395,50182326.91563,49655873.97539,47073350.98572,105806265.25481,55435743.5891,0.00502,50182326.91563
6,mmff94_0,48232860.75935,49037350.36742,49118384.55565,49003086.84181,49653983.53859,49009133.21256,0.00444,49118384.55565
7,1Jlast_GetPartialCharge,18570103.28582,10734010.39099,46760657.85921,20200088.00952,61960947.58012,31645161.42513,0.00287,46760657.85921
12,eem2015ba_1,3982053.90081,5512307.91377,4649351.29248,38814866.10628,6918415.59982,11975398.96263,0.00109,38814866.10628
8,1J1st_AveSmallestBondAngle_diff,32914264.39739,30538318.51914,31457962.51153,30915137.92492,28030733.66624,30771283.40384,0.00279,32914264.39739


In [9]:

qm9_cols = ['rc_A', 'rc_B', 'rc_C', 'mu', 'alpha',
       'homo', 'lumo', 'gap', 'r2', 'zpve', 'U0', 'U', 'H', 'G', 'Cv',
       'freqs_min', 'freqs_max', 'freqs_mean', 'linear', 'mulliken_min',
       'mulliken_max', 'mulliken_mean', 'mulliken_atom_0', 'mulliken_atom_1']
tmp = importance_df.sort_values("ratio", ascending=False)
tmp[tmp.feature.isin(qm9_cols)]

fold,feature,1,2,3,4,5,ave,ratio,max
24,mulliken_atom_0,3515606.26718,3472943.95035,3434507.83088,3675282.74278,3457118.90372,3511091.93898,0.00032,3675282.74278
129,mulliken_atom_1,240661.28613,308203.24116,325769.79556,275291.04842,285669.29111,287118.93247,3e-05,325769.79556
176,gap,157010.14573,176418.00965,132151.27763,143936.75272,176316.8939,157166.61592,1e-05,176418.00965
217,lumo,143204.7359,110740.2604,97294.70134,84006.06401,81203.81863,103289.91606,1e-05,143204.7359
291,freqs_mean,48015.50117,29241.50419,40870.46331,50503.43129,38332.09814,41392.59962,0.0,50503.43129
296,homo,36323.63716,40326.37372,39452.18974,44078.13755,37276.62515,39491.39266,0.0,44078.13755
305,mu,38018.03603,40023.67815,39075.87113,28793.20144,38243.83842,36830.92503,0.0,40023.67815
324,freqs_min,26684.44489,36681.99492,27528.53056,27947.45349,27935.68862,29355.6225,0.0,36681.99492
331,rc_C,21237.76054,23493.90985,26389.43226,35247.10554,23605.92635,25994.82691,0.0,35247.10554
336,rc_A,26034.73839,23697.25358,25557.27633,24451.22167,25464.30881,25040.95976,0.0,26034.73839


In [10]:
for c in qm9_cols:
    print(c)

rc_A
rc_B
rc_C
mu
alpha
homo
lumo
gap
r2
zpve
U0
U
H
G
Cv
freqs_min
freqs_max
freqs_mean
linear
mulliken_min
mulliken_max
mulliken_mean
mulliken_atom_0
mulliken_atom_1


In [27]:
for c in importance_df[importance_df.ratio<0.000006].feature.values:
    print(c)

acsf_53_0
acsf_44_0
acsf_58_1
acsf_32_1
acsf_9_0
acsf_20_1
acsf_59_1
acsf_14_1
acsf_7_1
acsf_20_0
acsf_12_1
acsf_55_1
acsf_57_1
acsf_12_0
acsf_53_1
acsf_57_0
acsf_10_1
acsf_4_0
acsf_8_0
acsf_17_1
acsf_54_0
acsf_52_1
acsf_54_1
acsf_43_1
acsf_4_1
acsf_58_0
acsf_56_0
acsf_66_1
acsf_41_0
acsf_56_1
acsf_42_0
acsf_40_0
acsf_8_1
acsf_42_1
acsf_43_0
acsf_40_1
acsf_41_1
acsf_60_1
acsf_63_1
acsf_66_0
acsf_61_1
acsf_67_1
acsf_64_0
acsf_64_1
acsf_16_0
acsf_67_0
acsf_19_1
acsf_65_1
acsf_16_1
acsf_65_0
acsf_62_0
acsf_19_0
acsf_63_0
acsf_61_0
acsf_18_1
acsf_76_1
acsf_18_0
acsf_62_1
acsf_78_1
acsf_17_0
acsf_77_1
acsf_76_0
acsf_68_0
acsf_79_1
acsf_71_1
acsf_75_0
acsf_69_1
acsf_71_0
acsf_77_0
acsf_69_0
acsf_72_1
acsf_70_1
acsf_74_0
acsf_73_1
acsf_78_0
acsf_60_0
acsf_72_0
acsf_75_1
acsf_68_1
acsf_74_1
acsf_79_0
acsf_73_0
acsf_70_0


In [26]:
new_cols = [
'1J_ex1_dist_fromEx1_mean', '1J_ex1_dist_fromEx1_max',
'1J_ex1_dist_fromEx1_min', '1J_ex1_dist_fromEx1_std',
'1J_ex1_dist_fromEx1_max_min_diff', '1J_ex1_dist_from_first_mean',
'1J_ex1_dist_from_first_max', '1J_ex1_dist_from_first_min',
'1J_ex1_dist_from_first_std', '1J_ex1_dist_from_first_max_min_diff',
'1J_ex1_dist_from_last_mean', '1J_ex1_dist_from_last_max',
'1J_ex1_dist_from_last_min', '1J_ex1_dist_from_last_std',
'1J_ex1_dist_from_last_max_min_diff', '1J_ex1_angle_fromEx1_mean',
'1J_ex1_angle_fromEx1_max', '1J_ex1_angle_fromEx1_min',
'1J_ex1_angle_fromEx1_std', '1J_ex1_angle_fromEx1_max_min_diff',
'1J_ex1_torsion_F_L_EX1_mean', '1J_ex1_torsion_F_L_EX1_max',
'1J_ex1_torsion_F_L_EX1_min', '1J_ex1_torsion_F_L_EX1_std',
'1J_ex1_torsion_F_L_EX1_max_min_diff', '1J_ex1_cosT_F_L_EX1_mean',
'1J_ex1_cosT_F_L_EX1_max', '1J_ex1_cosT_F_L_EX1_min',
'1J_ex1_cosT_F_L_EX1_std', '1J_ex1_cosT_F_L_EX1_max_min_diff',
'1J_ex1_cos2T_F_L_EX1_mean', '1J_ex1_cos2T_F_L_EX1_max',
'1J_ex1_cos2T_F_L_EX1_min', '1J_ex1_cos2T_F_L_EX1_std',
'1J_ex1_cos2T_F_L_EX1_max_min_diff', '1J_ex1_Angle_0_1_mean',
'1J_ex1_Angle_0_1_max', '1J_ex1_Angle_0_1_min', '1J_ex1_Angle_0_1_std',
'1J_ex1_Angle_0_1_max_min_diff', '1J_ex1_dist_0_mean',
'1J_ex1_dist_0_max', '1J_ex1_dist_0_min', '1J_ex1_dist_0_std',
'1J_ex1_dist_0_max_min_diff', '1J_ex1_dist_1_mean', '1J_ex1_dist_1_max',
'1J_ex1_dist_1_min', '1J_ex1_dist_1_std', '1J_ex1_dist_1_max_min_diff',
'1J_ex1_GetHyb_mean', '1J_ex1_GetHyb_max', '1J_ex1_GetHyb_min',
'1J_ex1_GetHeteroValence_mean', '1J_ex1_GetHeteroValence_max',
'1J_ex1_GetHeteroValence_min', '1J_ex1_GetValence_mean',
'1J_ex1_GetValence_max', '1J_ex1_GetValence_min',
'1J_ex1_SpinMultiplicity_mean', '1J_ex1_SpinMultiplicity_max',
'1J_ex1_SpinMultiplicity_min', '1J_ex1_FormalCharge_mean',
'1J_ex1_FormalCharge_max', '1J_ex1_FormalCharge_min',
'1J_ex1_SmallestBondAngle_mean', '1J_ex1_SmallestBondAngle_max',
'1J_ex1_SmallestBondAngle_min', '1J_ex1_AverageBondAngle_mean',
'1J_ex1_AverageBondAngle_max', '1J_ex1_AverageBondAngle_min',
'1J_ex1_GetPartialCharge_mean', '1J_ex1_GetPartialCharge_max',
'1J_ex1_GetPartialCharge_min']

In [31]:
exist_col_imp = importance_df[~importance_df.feature.isin(new_cols)]
exist_col_imp

fold,feature,1,2,3,4,5,ave,ratio
0,1J1st_AverageBondAngle,10080677288.82402,10090126617.65196,10033922448.04514,10073079506.20924,10031007892.48314,10061762750.6427,0.91199
1,dist_C_0_x,279514583.18735,279674336.84655,278600590.18089,279462929.227,278542620.54377,279159011.99711,0.0253
2,eem2015ha_1,100919572.94378,100619237.02426,98915397.03644,67416031.57009,101818715.08395,93937790.7317,0.00851
3,1Jlast_GetPartialCharge,74267532.39007,57456197.15193,113497034.97996,74471851.26933,115722871.41892,87083097.44204,0.00789
4,dist_to_type_mean,69255281.12507,79713200.32747,76917648.11682,83895929.19298,88895341.02391,79735479.95725,0.00723
5,2Jd_idx1_2nd,10921335.64564,62987181.45479,65716053.31873,90081471.56174,100880120.6334,66117232.52286,0.00599
6,molecule_type_dist_std_diff,102731120.12066,48789766.38528,46519347.45391,22545016.68344,14077322.39353,46932514.60736,0.00425
7,mmff94_0,44281206.03289,45516943.46718,45174921.52883,43693455.88701,44283574.5596,44590020.2951,0.00404
8,2J2nd_AverageBondAngle,34351711.20969,32642501.08684,34551317.38892,34188705.72675,30634957.3653,33273838.5555,0.00302
9,1J1st_AveSmallestBondAngle_diff,35419469.55182,34115235.80343,32412012.50054,34026480.3144,22465840.00008,31687807.63405,0.00287


In [34]:
for c in exist_col_imp[exist_col_imp.ratio < 0.00001].feature.tolist():
    print(c)

molecule_atom_index_0_dist_mean_diff
d_O_from2nd_ratio_0
molecule_atom_index_0_dist_mean
interBond_IsInRing
dist_N_1_x
qeq_0
molecule_atom_index_0_dist_std_div
3J2nd_AverageBondAngle
qtpie_0
3J3rd__C
molecule_dist_min
2Jdist_from2nd_max_mean_diff
a1_inring5
molecule_type_dist_mean_diff
interBond_IsRotor
mean_circle_size
n_circle
2Jdist_from2nd_mean
mean_dist_ratio_O_from2nd
1J1st_CountFreeOxygens
molecule_atom_index_1_dist_max_diff
interBond_BondOrder
max_circle_size
3J3rd_isChiral
a0_nb_inring4
1J1st_BOSum
a0_nb_inring3
1J1st_IsAromatic
molecule_atom_index_1_dist_mean_diff
3Jdist_from2nd_max_mean_diff
3J3rd__N
molecule_atom_index_1_dist_mean_div
pca_exp_1
pca_exp_2
2J2nd_isChiral
3Jlast_MemberOfRingSize
1Jlast_GetValence
3J3rd__O
a0_nb_nb_n
1J1st_IsInRing
min_circle_size
1Jlast_GetImplicitValence
3J2nd_isChiral
1Jlast_GetAtomicNum
interBond_IsAmider
3Jlast_isChiral
interBond_IsDoubleBondGeometry
interBond_IsClosure
1J1st_ExplicitHydrogenCount
3J3rd_isAroma
1J1st_IsChiral
1Jlast_GetHvy

In [27]:
new_col_imp = importance_df[importance_df.feature.isin(new_cols)]
new_col_imp

fold,feature,1,2,3,4,5,ave,ratio
13,1J_ex1_angle_fromEx1_mean,15111072.89989,15160596.14558,3965521.84576,4120104.40444,3609662.12277,8393391.48369,0.00076
15,1J_ex1_angle_fromEx1_min,3403609.19541,5430309.08313,7698493.20745,10461930.48694,8709251.39935,7140718.67446,0.00065
19,1J_ex1_angle_fromEx1_std,5517333.85148,6651239.86708,7154443.71288,6556880.66427,5862694.55976,6348518.5311,0.00058
32,1J_ex1_Angle_0_1_max,2826743.88346,1107374.79863,1122102.46134,3099219.60466,2950397.04705,2221167.55903,0.0002
34,1J_ex1_Angle_0_1_min,506144.45336,1949336.09338,2805986.25149,1978686.15403,2931667.08364,2034364.00718,0.00018
36,1J_ex1_dist_from_first_std,1186764.8103,1535416.43849,1560882.80864,2357184.14031,1941765.8041,1716402.80037,0.00016
37,1J_ex1_dist_from_first_max_min_diff,1571181.0823,1677041.04539,1782176.42848,1550144.39905,1894818.01591,1695072.19423,0.00015
40,1J_ex1_Angle_0_1_mean,1433954.55384,1747373.74023,1459347.63854,1465032.09725,1346252.69904,1490392.14578,0.00014
45,1J_ex1_AverageBondAngle_max,1092924.73508,1172050.73822,1117197.71854,1397787.42038,1207371.31198,1197466.38484,0.00011
46,1J_ex1_dist_1_std,1022904.41386,1233790.79896,827434.54661,1435485.96688,1339506.3883,1171824.42292,0.00011


In [29]:
new_col_imp[new_col_imp.ratio >= 0.00003].feature.tolist()

['1J_ex1_angle_fromEx1_mean',
 '1J_ex1_angle_fromEx1_min',
 '1J_ex1_angle_fromEx1_std',
 '1J_ex1_Angle_0_1_max',
 '1J_ex1_Angle_0_1_min',
 '1J_ex1_dist_from_first_std',
 '1J_ex1_dist_from_first_max_min_diff',
 '1J_ex1_Angle_0_1_mean',
 '1J_ex1_AverageBondAngle_max',
 '1J_ex1_dist_1_std',
 '1J_ex1_SmallestBondAngle_max',
 '1J_ex1_dist_1_min',
 '1J_ex1_dist_0_min',
 '1J_ex1_dist_0_std',
 '1J_ex1_dist_from_first_min',
 '1J_ex1_cos2T_F_L_EX1_mean',
 '1J_ex1_dist_from_first_max',
 '1J_ex1_dist_0_mean',
 '1J_ex1_dist_0_max_min_diff',
 '1J_ex1_cos2T_F_L_EX1_std',
 '1J_ex1_dist_1_max_min_diff',
 '1J_ex1_dist_from_first_mean',
 '1J_ex1_angle_fromEx1_max',
 '1J_ex1_dist_1_mean',
 '1J_ex1_Angle_0_1_std',
 '1J_ex1_angle_fromEx1_max_min_diff']

In [19]:
importance_df["ratio"] = importance_df["ave"]/importance_df["ave"].sum()

In [20]:
importance_df

fold,feature,1,2,3,4,5,ave,ratio
0,1J1st_AverageBondAngle,10115884918.28777,10104623380.19084,10077264638.4704,10084160112.46832,10080560787.03906,10092498767.29128,0.91468
1,dist_C_0_x,280120014.54616,279833531.23166,279593835.31282,279821082.72638,279872226.20915,279848138.00524,0.02536
2,molecule_type_dist_std_diff,115105961.16502,114715012.15516,114084182.62603,114797520.95381,114808629.88491,114702261.35698,0.0104
3,eem2015ha_1,102088081.00437,100064969.96234,100210704.75236,66487691.59089,99173690.76682,93605027.61536,0.00848
4,dist_to_type_mean,81244121.25743,89462339.00229,77507890.13287,90773896.35203,90054689.16886,85808587.1827,0.00778
5,1Jlast_GetPartialCharge,46833816.37383,47060510.64718,74688681.82621,67855604.92062,61312015.0321,59550125.75999,0.0054
6,mmff94_0,45796584.1604,46252482.76216,46347976.04719,46407730.69059,43156997.36417,45592354.2049,0.00413
7,2J2nd_AverageBondAngle,41803603.32371,41356164.24348,40960581.55232,41677370.80667,42094992.1251,41578542.41026,0.00377
8,1J1st_AveSmallestBondAngle_diff,23456526.9718,33096529.84937,37730051.26893,25035991.40652,23994268.74588,28662673.6485,0.0026
9,cosT,22968294.08123,22759577.35705,22757029.44653,22737032.08236,22601136.37542,22764613.86852,0.00206


In [22]:
importance_df_y = importance_df[importance_df.feature.isin(yiemon_HnJ)]

In [26]:
importance_df_y_high = importance_df_y[importance_df_y.ratio >= 0.00005]
importance_df_y_high

fold,feature,1,2,3,4,5,ave,ratio
13,mean_dist_C_from2nd,2219107.8072,1082870.4141,12345291.11046,5667477.4708,12604486.16454,6783846.59342,0.00061
17,mean_angle_C_from2nd,4464357.23324,4365691.1972,5100971.37583,4677631.60805,6538590.9395,5029448.47077,0.00046
36,d_O_from2nd_ratio_0,1745468.21043,1250039.08453,1385857.7582,1378011.21881,1703789.89275,1492633.23294,0.00014
39,mean_dist_ratio_C_from2nd,1174243.71498,1314657.43831,1302007.18286,1324328.27011,1197997.00485,1262646.72222,0.00011
40,mean_dist_ratio_O_from1st,2685933.43542,241324.03499,236974.91415,2122986.04824,918808.59357,1241205.40528,0.00011
41,mean_dist_ratio_O_from2nd,548789.525,2173530.02903,1247912.36466,954880.55809,1167358.27827,1218494.15101,0.00011
42,d_O_from1st_ratio_0,227104.04968,1299200.63347,2450670.81099,631376.41198,1346113.18302,1190893.01783,0.00011
56,mean_angle_O_from2nd,505996.26947,495110.14479,1123202.9626,889573.24679,806021.17185,763980.7591,7e-05
65,d_O_from1st_0,542893.52662,407577.05864,522879.46225,709298.34846,801393.8619,596808.45157,5e-05
67,mean_dist_C_from1st,570900.9896,577426.49378,561287.75217,666204.12254,450796.57512,565323.18664,5e-05


In [28]:
importance_df_y_high.feature.tolist()


['mean_dist_C_from2nd',
 'mean_angle_C_from2nd',
 'd_O_from2nd_ratio_0',
 'mean_dist_ratio_C_from2nd',
 'mean_dist_ratio_O_from1st',
 'mean_dist_ratio_O_from2nd',
 'd_O_from1st_ratio_0',
 'mean_angle_O_from2nd',
 'd_O_from1st_0',
 'mean_dist_C_from1st']

In [18]:
for c in pd.read_csv("/Users/kenichi.matsui/Downloads/rdkit_train.csv").columns:
    print(c)

Unnamed: 0
id
a1_degree
a1_hybridization
a1_inring
a1_inring3
a1_inring4
a1_inring5
a1_inring6
a1_inring7
a1_inring8
a1_nb_h
a1_nb_o
a1_nb_c
a1_nb_n
a1_nb_na
a0_nb_degree
a0_nb_hybridization
a0_nb_inring
a0_nb_inring3
a0_nb_inring4
a0_nb_inring5
a0_nb_inring6
a0_nb_inring7
a0_nb_inring8
a0_nb_nb_h
a0_nb_nb_o
a0_nb_nb_c
a0_nb_nb_n
a0_nb_nb_na
x_a0_nb
y_a0_nb
z_a0_nb
a1_nb_degree
a1_nb_hybridization
a1_nb_inring
a1_nb_inring3
a1_nb_inring4
a1_nb_inring5
a1_nb_inring6
a1_nb_inring7
a1_nb_inring8
a1_nb_nb_h
a1_nb_nb_o
a1_nb_nb_c
a1_nb_nb_n
a1_nb_nb_na
x_a1_nb
y_a1_nb
z_a1_nb
dist_to_type_mean
