In [1]:
# 基本ライブラリ
import pandas as pd
import pandas.io.sql as psql
import numpy as np
import numpy.random as rd
import gc
import multiprocessing as mp
import os
import sys
import pickle
from collections import defaultdict
from glob import glob
import math
from datetime import datetime as dt
from pathlib import Path
import scipy.stats as st
import re
import shutil
from tqdm import tqdm_notebook as tqdm
import datetime
# Element-wise converter: Unix time in seconds (10 digits) => datetime.
# Wraps datetime.datetime.fromtimestamp with np.vectorize so it can be applied to arrays.
ts_conv = np.vectorize(datetime.datetime.fromtimestamp)

# グラフ描画系
import matplotlib
from matplotlib import font_manager
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from matplotlib import rc

from matplotlib import animation as ani
from IPython.display import Image

# Force matplotlib to draw patch edges (rcParam "patch.force_edgecolor").
plt.rcParams["patch.force_edgecolor"] = True
# LaTeX text rendering is left disabled:
#rc('text', usetex=True)
from IPython.display import display # Allows the use of display() for DataFrames
import seaborn as sns
# Seaborn theme: white grid background, muted palette, colour codes enabled.
sns.set(style="whitegrid", palette="muted", color_codes=True)
# Re-apply the whitegrid style, this time with dashed grid lines.
sns.set_style("whitegrid", rc={'grid.linestyle': '--'})

# Named xkcd colours kept as module-level shortcuts.
red, green, blue = (
    sns.xkcd_rgb[colour_name]
    for colour_name in ("light red", "medium green", "denim blue")
)

# Maximum number of characters shown per column cell (pandas default is 50).
pd.options.display.max_colwidth = 100

# Number of rows / columns shown: None removes the limit.
pd.options.display.max_rows = None
pd.options.display.max_columns = None

# Render floats with thousands separators and five decimal places.
pd.set_option("display.float_format", '{:,.5f}'.format)

%matplotlib inline
%config InlineBackend.figure_format='retina'

In [2]:
# Run identifiers. All three are interpolated into the path of the
# feature-importance CSV that is loaded in the following cell.
# Version tag (string) used in the log directory and file name.
DATA_VERSION = "v003"
# Trial number, kept as a zero-padded string because it is embedded in paths.
TRIAL_NO = "059"
# Seed value that appears as the last component of the CSV file name.
seed = 2069

In [15]:
# Location of the per-fold importance log for this data version / trial / seed.
path = f"../log/{DATA_VERSION}_{TRIAL_NO}/importance_fc_{DATA_VERSION}_{TRIAL_NO}_{seed}.csv"

# Build the ranking in one chain: wide table (feature x fold), row-wise mean,
# descending sort, then move "feature" from the index back into a column.
importance_df = (
    pd.read_csv(path, index_col=0)
    .pivot_table(index="feature", columns="fold", values="importance")
    .assign(ave=lambda per_fold: per_fold.mean(axis=1))
    .sort_values("ave", ascending=False)
    .reset_index()
)

In [17]:
# Feature names to look up in the importance table; the list is passed to
# `isin` on the "feature" column in a later cell.
# NOTE(review): the names suggest per-element (C/H/N/O) angle and distance
# statistics plus partial-charge features — confirm against the code that
# generates them.
yiemon_HnJ = [
'angle_C_from2nd_0',
 'angle_C_from2nd_0_ratio',
 'angle_C_from2nd_1',
 'angle_C_from2nd_1_ratio',
 'angle_C_from2nd_2',
 'angle_C_from2nd_2_ratio',
 'angle_H_from2nd_0',
 'angle_H_from2nd_0_ratio',
 'angle_H_from2nd_1',
 'angle_H_from2nd_1_ratio',
 'angle_N_from2nd_0',
 'angle_N_from2nd_0_ratio',
 'angle_N_from2nd_1',
 'angle_N_from2nd_1_ratio',
 'angle_O_from2nd_0',
 'angle_O_from2nd_0_ratio',
 'angle_O_from2nd_1',
 'angle_O_from2nd_1_ratio',
 'count_dist_C_from1st',
 'count_dist_C_from2nd',
 'count_dist_H_from1st',
 'count_dist_H_from2nd',
 'count_dist_N_from1st',
 'count_dist_N_from2nd',
 'count_dist_O_from1st',
 'count_dist_O_from2nd',
 'd_C_from1st_0',
 'd_C_from1st_1',
 'd_C_from1st_2',
 'd_C_from1st_ratio_0',
 'd_C_from1st_ratio_1',
 'd_C_from1st_ratio_2',
 'd_C_from2nd_0',
 'd_C_from2nd_1',
 'd_C_from2nd_2',
 'd_C_from2nd_ratio_0',
 'd_C_from2nd_ratio_1',
 'd_C_from2nd_ratio_2',
 'd_H_from1st_0',
 'd_H_from1st_1',
 'd_H_from1st_ratio_0',
 'd_H_from1st_ratio_1',
 'd_H_from2nd_0',
 'd_H_from2nd_1',
 'd_H_from2nd_ratio_0',
 'd_H_from2nd_ratio_1',
 'd_N_from1st_0',
 'd_N_from1st_1',
 'd_N_from1st_ratio_0',
 'd_N_from1st_ratio_1',
 'd_N_from2nd_0',
 'd_N_from2nd_1',
 'd_N_from2nd_ratio_0',
 'd_N_from2nd_ratio_1',
 'd_O_from1st_0',
 'd_O_from1st_1',
 'd_O_from1st_ratio_0',
 'd_O_from1st_ratio_1',
 'd_O_from2nd_0',
 'd_O_from2nd_1',
 'd_O_from2nd_ratio_0',
 'd_O_from2nd_ratio_1',
 'eem_length2',
 'eem_x',
 'gasteiger_length2',
 'gasteiger_x',
 'mean_angle_C_from2nd',
 'mean_angle_H_from2nd',
 'mean_angle_N_from2nd',
 'mean_angle_O_from2nd',
 'mean_dist_C_from1st',
 'mean_dist_C_from2nd',
 'mean_dist_H_from1st',
 'mean_dist_H_from2nd',
 'mean_dist_N_from1st',
 'mean_dist_N_from2nd',
 'mean_dist_O_from1st',
 'mean_dist_O_from2nd',
 'mean_dist_ratio_C_from1st',
 'mean_dist_ratio_C_from2nd',
 'mean_dist_ratio_H_from1st',
 'mean_dist_ratio_H_from2nd',
 'mean_dist_ratio_N_from1st',
 'mean_dist_ratio_N_from2nd',
 'mean_dist_ratio_O_from1st',
 'mean_dist_ratio_O_from2nd',
 'mmff94_length2',
 'mmff94_x',
 'qeq_length2',
 'qeq_x',
 'std_angle_C_from2nd',
 'std_angle_H_from2nd',
 'std_angle_N_from2nd',
 'std_angle_O_from2nd',
 'std_dist_C_from1st',
 'std_dist_C_from2nd',
 'std_dist_H_from1st',
 'std_dist_H_from2nd',
 'std_dist_N_from1st',
 'std_dist_N_from2nd',
 'std_dist_O_from1st',
 'std_dist_O_from2nd']

In [19]:
# Share of the summed mean importance contributed by each feature.
importance_df["ratio"] = importance_df["ave"].div(importance_df["ave"].sum())

In [20]:
# display.max_rows is None in this notebook, so a bare `importance_df` would
# render every feature row. Show only the top of the ranking instead.
importance_df.head(10)

fold,feature,1,2,3,4,5,ave,ratio
0,1J1st_AverageBondAngle,10115884918.28777,10104623380.19084,10077264638.4704,10084160112.46832,10080560787.03906,10092498767.29128,0.91468
1,dist_C_0_x,280120014.54616,279833531.23166,279593835.31282,279821082.72638,279872226.20915,279848138.00524,0.02536
2,molecule_type_dist_std_diff,115105961.16502,114715012.15516,114084182.62603,114797520.95381,114808629.88491,114702261.35698,0.0104
3,eem2015ha_1,102088081.00437,100064969.96234,100210704.75236,66487691.59089,99173690.76682,93605027.61536,0.00848
4,dist_to_type_mean,81244121.25743,89462339.00229,77507890.13287,90773896.35203,90054689.16886,85808587.1827,0.00778
5,1Jlast_GetPartialCharge,46833816.37383,47060510.64718,74688681.82621,67855604.92062,61312015.0321,59550125.75999,0.0054
6,mmff94_0,45796584.1604,46252482.76216,46347976.04719,46407730.69059,43156997.36417,45592354.2049,0.00413
7,2J2nd_AverageBondAngle,41803603.32371,41356164.24348,40960581.55232,41677370.80667,42094992.1251,41578542.41026,0.00377
8,1J1st_AveSmallestBondAngle_diff,23456526.9718,33096529.84937,37730051.26893,25035991.40652,23994268.74588,28662673.6485,0.0026
9,cosT,22968294.08123,22759577.35705,22757029.44653,22737032.08236,22601136.37542,22764613.86852,0.00206


In [22]:
# Keep only the rows whose feature name is listed in yiemon_HnJ.
importance_df_y = importance_df.loc[
    importance_df["feature"].isin(yiemon_HnJ)
]

In [26]:
# Minimum share of total importance ("ratio") a feature needs to be kept.
MIN_IMPORTANCE_RATIO = 0.00005

# Listed features at or above the threshold; the last expression displays them.
importance_df_y_high = importance_df_y[importance_df_y["ratio"] >= MIN_IMPORTANCE_RATIO]
importance_df_y_high

fold,feature,1,2,3,4,5,ave,ratio
13,mean_dist_C_from2nd,2219107.8072,1082870.4141,12345291.11046,5667477.4708,12604486.16454,6783846.59342,0.00061
17,mean_angle_C_from2nd,4464357.23324,4365691.1972,5100971.37583,4677631.60805,6538590.9395,5029448.47077,0.00046
36,d_O_from2nd_ratio_0,1745468.21043,1250039.08453,1385857.7582,1378011.21881,1703789.89275,1492633.23294,0.00014
39,mean_dist_ratio_C_from2nd,1174243.71498,1314657.43831,1302007.18286,1324328.27011,1197997.00485,1262646.72222,0.00011
40,mean_dist_ratio_O_from1st,2685933.43542,241324.03499,236974.91415,2122986.04824,918808.59357,1241205.40528,0.00011
41,mean_dist_ratio_O_from2nd,548789.525,2173530.02903,1247912.36466,954880.55809,1167358.27827,1218494.15101,0.00011
42,d_O_from1st_ratio_0,227104.04968,1299200.63347,2450670.81099,631376.41198,1346113.18302,1190893.01783,0.00011
56,mean_angle_O_from2nd,505996.26947,495110.14479,1123202.9626,889573.24679,806021.17185,763980.7591,7e-05
65,d_O_from1st_0,542893.52662,407577.05864,522879.46225,709298.34846,801393.8619,596808.45157,5e-05
67,mean_dist_C_from1st,570900.9896,577426.49378,561287.75217,666204.12254,450796.57512,565323.18664,5e-05


In [28]:
# Names of the retained features as a plain Python list.
importance_df_y_high["feature"].tolist()


['mean_dist_C_from2nd',
 'mean_angle_C_from2nd',
 'd_O_from2nd_ratio_0',
 'mean_dist_ratio_C_from2nd',
 'mean_dist_ratio_O_from1st',
 'mean_dist_ratio_O_from2nd',
 'd_O_from1st_ratio_0',
 'mean_angle_O_from2nd',
 'd_O_from1st_0',
 'mean_dist_C_from1st']