In [1]:
# 基本ライブラリ
import pandas as pd
import pandas.io.sql as psql
import numpy as np
import numpy.random as rd
import gc
import multiprocessing as mp
import os
import sys
import pickle
from collections import defaultdict
from glob import glob
import math
from datetime import datetime as dt
from pathlib import Path
import scipy.stats as st
import re
import shutil
from tqdm import tqdm_notebook as tqdm
import datetime
ts_conv = np.vectorize(datetime.datetime.fromtimestamp) # 秒ut(10桁) ⇒ 日付

# グラフ描画系
import matplotlib
from matplotlib import font_manager
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from matplotlib import rc

from matplotlib import animation as ani
from IPython.display import Image

plt.rcParams["patch.force_edgecolor"] = True
#rc('text', usetex=True)
from IPython.display import display # Allows the use of display() for DataFrames
import seaborn as sns
sns.set(style="whitegrid", palette="muted", color_codes=True)
sns.set_style("whitegrid", {'grid.linestyle': '--'})
red = sns.xkcd_rgb["light red"]
green = sns.xkcd_rgb["medium green"]
blue = sns.xkcd_rgb["denim blue"]

#カラム内の文字数。デフォルトは50
pd.set_option("display.max_colwidth", 100)

#行数
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
#
pd.options.display.float_format = '{:,.5f}'.format

%matplotlib inline


from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, KFold, RepeatedKFold

from sklearn import metrics
import json

import warnings
warnings.filterwarnings("ignore")


sys.path.append('..')
from lib.line_notif import send_message
from lib.utils import reduce_mem_usage, current_time, unpickle, to_pickle
from lib.utils import one_hot_encoder, apply_agg, multi_combine_categorical_feature
from lib.utils import import_data, get_split_indexer 

In [2]:

def map_atom_info(df, atom_idx):
    df = pd.merge(df, structures, how = 'left',
                  left_on  = ['molecule_name', f'atom_index_{atom_idx}'],
                  right_on = ['molecule_name',  'atom_index'])
    
    df = df.drop('atom_index', axis=1)
    df = df.rename(columns={'atom': f'atom_{atom_idx}',
                            'x': f'x_{atom_idx}',
                            'y': f'y_{atom_idx}',
                            'z': f'z_{atom_idx}'})
    return df

In [3]:
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')
sub = pd.read_csv('../input/sample_submission.csv')
structures = pd.read_csv('../input/structures.csv')

In [4]:
print("train 1")
train = map_atom_info(train, 0)
print("train 2")
train = map_atom_info(train, 1)
print("train 3")

print("test 1")
test = map_atom_info(test, 0)
print("test 2")
test = map_atom_info(test, 1)
print("test 3")

train 1
train 2
train 3
test 1
test 2
test 3


In [5]:
train.head()

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant,atom_0,x_0,y_0,z_0,atom_1,x_1,y_1,z_1
0,0,dsgdb9nsd_000001,1,0,1JHC,84.8076,H,0.00215,-0.00603,0.00198,C,-0.0127,1.0858,0.008
1,1,dsgdb9nsd_000001,1,2,2JHH,-11.257,H,0.00215,-0.00603,0.00198,H,1.01173,1.46375,0.00028
2,2,dsgdb9nsd_000001,1,3,2JHH,-11.2548,H,0.00215,-0.00603,0.00198,H,-0.54082,1.44753,-0.87664
3,3,dsgdb9nsd_000001,1,4,2JHH,-11.2543,H,0.00215,-0.00603,0.00198,H,-0.52381,1.43793,0.9064
4,4,dsgdb9nsd_000001,2,0,1JHC,84.8074,H,1.01173,1.46375,0.00028,C,-0.0127,1.0858,0.008


In [6]:
test.head()

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,atom_0,x_0,y_0,z_0,atom_1,x_1,y_1,z_1
0,4658147,dsgdb9nsd_000004,2,0,2JHC,H,-1.66164,0.0,1.0,C,0.59954,0.0,1.0
1,4658148,dsgdb9nsd_000004,2,1,1JHC,H,-1.66164,0.0,1.0,C,-0.59954,0.0,1.0
2,4658149,dsgdb9nsd_000004,2,3,3JHH,H,-1.66164,0.0,1.0,H,1.66164,0.0,1.0
3,4658150,dsgdb9nsd_000004,3,0,1JHC,H,1.66164,0.0,1.0,C,0.59954,0.0,1.0
4,4658151,dsgdb9nsd_000004,3,1,2JHC,H,1.66164,0.0,1.0,C,-0.59954,0.0,1.0


In [7]:
def angle_feat(df):
    df_feat = pd.DataFrame({"id":df.id.values}, index=df.index.values)
    for axis in ["x", "y", "z"]:
        df_feat[f"{axis}_diff"] = df[f"{axis}_0"] - df[f"{axis}_1"]

    df_feat["diff_norm"] = (df_feat.x_diff**2 + df_feat.y_diff**2 + df_feat.z_diff**2)**0.5
    df_feat["zero_norm"] = (df.x_0**2 + df.y_0**2 + df.z_0**2)**0.5

    for axis in ["x", "y", "z"]:
        df_feat[f"{axis}_diff"] = df_feat[f"{axis}_diff"].values / df_feat["diff_norm"].values
        df_feat[f"{axis}_0"] = df[f"{axis}_0"].values / df_feat["zero_norm"].values

    df_feat["f004:angle"] = df_feat.x_diff*df_feat.x_0 + df_feat.x_diff*df_feat.y_0 + df_feat.x_diff*df_feat.z_0
    df_feat["f004:angle_abs"] = np.abs(df_feat["f004:angle"])
    return df_feat[["id", "f004:angle", "f004:angle_abs"]]

In [8]:
angle_df_train = angle_feat(train)
angle_df_test  = angle_feat(test)

In [9]:
angle_df_train.head()

Unnamed: 0,id,f004:angle,f004:angle_abs
0,0,-0.00387,0.00387
1,1,0.16094,0.16094
2,2,-0.08655,0.08655
3,3,-0.08384,0.08384
4,4,1.30533,1.30533


In [10]:
angle_df_test.head()

Unnamed: 0,id,f004:angle,f004:angle_abs
0,4658147,0.34117,0.34117
1,4658148,0.34117,0.34117
2,4658149,0.34117,0.34117
3,4658150,1.37244,1.37244
4,4658151,1.37244,1.37244


In [11]:
train = train.merge(angle_df_train, on="id", how="left")
test = test.merge(angle_df_test, on="id", how="left")

In [14]:
def angle_additional(df):
    for agg in ['mean', 'min', 'max', 'std']:
        df[f'angle_{agg}'] = df.groupby('molecule_name')['f004:angle'].transform(agg)
        df[f'angle_abs_{agg}'] = df.groupby('molecule_name')['f004:angle_abs'].transform(agg)
    return df

In [15]:
angle_df_train["molecule_name"] = train["molecule_name"]
angle_df_test["molecule_name"]  = test["molecule_name"]

In [16]:
angle_additional_train = angle_additional(angle_df_train[["id", 'molecule_name', "f004:angle", "f004:angle_abs"]])
angle_additional_train.drop(["molecule_name","f004:angle","f004:angle_abs"], axis=1, inplace=True)
angle_additional_train.rename({c:f"f005:{c}" for c in angle_additional_train.columns if c!="id"},
                             axis=1, inplace=True)

angle_additional_test = angle_additional(angle_df_test[["id", 'molecule_name', "f004:angle", "f004:angle_abs"]])
angle_additional_test.drop(["molecule_name","f004:angle","f004:angle_abs"], axis=1, inplace=True)
angle_additional_test.rename({c:f"f005:{c}" for c in angle_additional_test.columns if c!="id"},
                             axis=1, inplace=True)

In [17]:
angle_additional_train.head()

Unnamed: 0,id,f005:angle_mean,f005:angle_abs_mean,f005:angle_min,f005:angle_abs_min,f005:angle_max,f005:angle_abs_max,f005:angle_std,f005:angle_abs_std
0,0,0.32142,0.45376,-0.4791,0.00016,1.30533,1.30533,0.65354,0.55958
1,1,0.32142,0.45376,-0.4791,0.00016,1.30533,1.30533,0.65354,0.55958
2,2,0.32142,0.45376,-0.4791,0.00016,1.30533,1.30533,0.65354,0.55958
3,3,0.32142,0.45376,-0.4791,0.00016,1.30533,1.30533,0.65354,0.55958
4,4,0.32142,0.45376,-0.4791,0.00016,1.30533,1.30533,0.65354,0.55958


In [18]:
angle_additional_test.head()

Unnamed: 0,id,f005:angle_mean,f005:angle_abs_mean,f005:angle_min,f005:angle_abs_min,f005:angle_max,f005:angle_abs_max,f005:angle_std,f005:angle_abs_std
0,4658147,0.75368,0.75368,0.34117,0.34117,1.37244,1.37244,0.56485,0.56485
1,4658148,0.75368,0.75368,0.34117,0.34117,1.37244,1.37244,0.56485,0.56485
2,4658149,0.75368,0.75368,0.34117,0.34117,1.37244,1.37244,0.56485,0.56485
3,4658150,0.75368,0.75368,0.34117,0.34117,1.37244,1.37244,0.56485,0.56485
4,4658151,0.75368,0.75368,0.34117,0.34117,1.37244,1.37244,0.56485,0.56485


In [19]:
angle_additional_train.shape, angle_additional_test.shape

((4658147, 9), (2505542, 9))

In [20]:
train.shape, test.shape

((4658147, 16), (2505542, 15))

In [21]:
DATA_VERSION = "v003"
save_path = Path(f"../processed/{DATA_VERSION}")
save_path.mkdir(parents=True, exist_ok=True)
to_pickle(save_path/"train_005.df.pkl", angle_additional_train)
to_pickle(save_path/"test_005.df.pkl", angle_additional_test)

In [22]:
angle_additional_test.columns.values.tolist()

['id',
 'f005:angle_mean',
 'f005:angle_abs_mean',
 'f005:angle_min',
 'f005:angle_abs_min',
 'f005:angle_max',
 'f005:angle_abs_max',
 'f005:angle_std',
 'f005:angle_abs_std']