In [1]:
# 基本ライブラリ
import pandas as pd
import pandas.io.sql as psql
import numpy as np
import numpy.random as rd
import gc
import multiprocessing as mp
import os
import sys
import pickle
from collections import defaultdict
from glob import glob
import math
from datetime import datetime as dt
from pathlib import Path
import scipy.stats as st
import re
import shutil
from tqdm import tqdm_notebook as tqdm
import datetime
ts_conv = np.vectorize(datetime.datetime.fromtimestamp) # 秒ut(10桁) ⇒ 日付

# グラフ描画系
import matplotlib
from matplotlib import font_manager
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from matplotlib import rc

from matplotlib import animation as ani
from IPython.display import Image

plt.rcParams["patch.force_edgecolor"] = True
#rc('text', usetex=True)
from IPython.display import display # Allows the use of display() for DataFrames
import seaborn as sns
sns.set(style="whitegrid", palette="muted", color_codes=True)
sns.set_style("whitegrid", {'grid.linestyle': '--'})
red = sns.xkcd_rgb["light red"]
green = sns.xkcd_rgb["medium green"]
blue = sns.xkcd_rgb["denim blue"]

#カラム内の文字数。デフォルトは50
pd.set_option("display.max_colwidth", 100)

#行数
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
#
pd.options.display.float_format = '{:,.5f}'.format

%matplotlib inline

In [2]:

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, KFold, RepeatedKFold

from sklearn import metrics
import json

import warnings
warnings.filterwarnings("ignore")


sys.path.append('..')
from lib.line_notif import send_message
from lib.utils import reduce_mem_usage, current_time, unpickle, to_pickle
from lib.utils import one_hot_encoder, apply_agg, multi_combine_categorical_feature
from lib.utils import import_data, get_split_indexer 

In [3]:
# train = pd.read_csv('../input/train.csv')
# test = pd.read_csv('../input/test.csv')
# sub = pd.read_csv('../input/sample_submission.csv')
# structures = pd.read_csv('../input/structures.csv')

In [4]:

def map_atom_info(df_1, df_2, atom_idx):
    df = pd.merge(df_1, df_2, how = 'left',
                  left_on  = ['molecule_name', f'atom_index_{atom_idx}'],
                  right_on = ['molecule_name',  'atom_index'])
    df = df.drop('atom_index', axis=1)

    return df


def make_features(df):
    df['dx']=df['x_1']-df['x_0']
    df['dy']=df['y_1']-df['y_0']
    df['dz']=df['z_1']-df['z_0']
    df['distance']=(df['dx']**2+df['dy']**2+df['dz']**2)**(1/2)
    return df

In [5]:
def feat(df):
    df_temp=df.loc[:,["molecule_name","atom_index_0","atom_index_1","distance","x_0","y_0","z_0","x_1","y_1","z_1"]].copy()
    df_temp_=df_temp.copy()
    df_temp_= df_temp_.rename(columns={'atom_index_0': 'atom_index_1',
                                       'atom_index_1': 'atom_index_0',
                                       'x_0': 'x_1',
                                       'y_0': 'y_1',
                                       'z_0': 'z_1',
                                       'x_1': 'x_0',
                                       'y_1': 'y_0',
                                       'z_1': 'z_0'})
    df_temp=pd.concat((df_temp, df_temp_),axis=0)

    df_temp["min_distance"]=df_temp.groupby(['molecule_name', 'atom_index_0'])['distance'].transform('min')
    df_temp= df_temp[df_temp["min_distance"]==df_temp["distance"]]

    df_temp=df_temp.drop(['x_0','y_0','z_0','min_distance'], axis=1)
    df_temp= df_temp.rename(columns={'atom_index_0': 'atom_index',
                                     'atom_index_1': 'atom_index_closest',
                                     'distance': 'distance_closest',
                                     'x_1': 'x_closest',
                                     'y_1': 'y_closest',
                                     'z_1': 'z_closest'})
    return df_temp 

def add_cos_features(df):
    df["distance_0"]=((df['x_0']-df['x_closest_0'])**2+
                      (df['y_0']-df['y_closest_0'])**2+
                      (df['z_0']-df['z_closest_0'])**2)**(1/2)
    df["distance_1"]=((df['x_1']-df['x_closest_1'])**2+(df['y_1']-df['y_closest_1'])**2+(df['z_1']-df['z_closest_1'])**2)**(1/2)
    df["vec_0_x"]=(df['x_0']-df['x_closest_0'])/df["distance_0"]
    df["vec_0_y"]=(df['y_0']-df['y_closest_0'])/df["distance_0"]
    df["vec_0_z"]=(df['z_0']-df['z_closest_0'])/df["distance_0"]
    df["vec_1_x"]=(df['x_1']-df['x_closest_1'])/df["distance_1"]
    df["vec_1_y"]=(df['y_1']-df['y_closest_1'])/df["distance_1"]
    df["vec_1_z"]=(df['z_1']-df['z_closest_1'])/df["distance_1"]
    df["vec_x"]=(df['x_1']-df['x_0'])/df["distance"]
    df["vec_y"]=(df['y_1']-df['y_0'])/df["distance"]
    df["vec_z"]=(df['z_1']-df['z_0'])/df["distance"]
    df["cos_0_1"]=df["vec_0_x"]*df["vec_1_x"]+df["vec_0_y"]*df["vec_1_y"]+df["vec_0_z"]*df["vec_1_z"]
    df["cos_0"]=df["vec_0_x"]*df["vec_x"]+df["vec_0_y"]*df["vec_y"]+df["vec_0_z"]*df["vec_z"]
    df["cos_1"]=df["vec_1_x"]*df["vec_x"]+df["vec_1_y"]*df["vec_y"]+df["vec_1_z"]*df["vec_z"]
    df=df.drop(['vec_0_x','vec_0_y','vec_0_z','vec_1_x','vec_1_y','vec_1_z','vec_x','vec_y','vec_z'], axis=1)
    return df
    

In [6]:
df_train=pd.read_csv('../input/train.csv')
n_train = df_train.shape[0]
print(f"n_train: {n_train}")
df_test=pd.read_csv('../input/test.csv')
n_test = df_test.shape[0]
print(f"n_test: {n_test}")

df_struct=pd.read_csv('../input/structures.csv')


for atom_idx in [0,1]:
    df_train = map_atom_info(df_train, df_struct, atom_idx)
    df_test  = map_atom_info(df_test, df_struct, atom_idx)
    df_train = df_train.rename(columns={'atom': f'atom_{atom_idx}',
                                        'x': f'x_{atom_idx}',
                                        'y': f'y_{atom_idx}',
                                        'z': f'z_{atom_idx}'})
    df_test = df_test.rename(columns={'atom': f'atom_{atom_idx}',
                                        'x': f'x_{atom_idx}',
                                        'y': f'y_{atom_idx}',
                                        'z': f'z_{atom_idx}'})
    
df_train = make_features(df_train)
assert n_train == df_train.shape[0], f"{n_train} {df_train.shape[0]}"
df_test  = make_features(df_test)
assert n_test == df_test.shape[0], f"{n_test} {df_test.shape[0]}"

n_train: 4658147
n_test: 2505542


In [7]:
df_train.head()

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant,atom_0,x_0,y_0,z_0,atom_1,x_1,y_1,z_1,dx,dy,dz,distance
0,0,dsgdb9nsd_000001,1,0,1JHC,84.8076,H,0.00215,-0.00603,0.00198,C,-0.0127,1.0858,0.008,-0.01485,1.09184,0.00602,1.09195
1,1,dsgdb9nsd_000001,1,2,2JHH,-11.257,H,0.00215,-0.00603,0.00198,H,1.01173,1.46375,0.00028,1.00958,1.46978,-0.0017,1.78312
2,2,dsgdb9nsd_000001,1,3,2JHH,-11.2548,H,0.00215,-0.00603,0.00198,H,-0.54082,1.44753,-0.87664,-0.54297,1.45356,-0.87862,1.78315
3,3,dsgdb9nsd_000001,1,4,2JHH,-11.2543,H,0.00215,-0.00603,0.00198,H,-0.52381,1.43793,0.9064,-0.52596,1.44396,0.90442,1.78316
4,4,dsgdb9nsd_000001,2,0,1JHC,84.8074,H,1.01173,1.46375,0.00028,C,-0.0127,1.0858,0.008,-1.02443,-0.37795,0.00772,1.09195


In [8]:
df_test.head()

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,atom_0,x_0,y_0,z_0,atom_1,x_1,y_1,z_1,dx,dy,dz,distance
0,4658147,dsgdb9nsd_000004,2,0,2JHC,H,-1.66164,0.0,1.0,C,0.59954,0.0,1.0,2.26118,0.0,0.0,2.26118
1,4658148,dsgdb9nsd_000004,2,1,1JHC,H,-1.66164,0.0,1.0,C,-0.59954,0.0,1.0,1.0621,0.0,0.0,1.0621
2,4658149,dsgdb9nsd_000004,2,3,3JHH,H,-1.66164,0.0,1.0,H,1.66164,0.0,1.0,3.32328,0.0,0.0,3.32328
3,4658150,dsgdb9nsd_000004,3,0,1JHC,H,1.66164,0.0,1.0,C,0.59954,0.0,1.0,-1.0621,0.0,0.0,1.0621
4,4658151,dsgdb9nsd_000004,3,1,2JHC,H,1.66164,0.0,1.0,C,-0.59954,0.0,1.0,-2.26118,0.0,0.0,2.26118


In [9]:
df_train.shape

(4658147, 18)

In [10]:
df_train_ = feat(df_train)
df_test_ = feat(df_test)

In [11]:
df_train_.head()

Unnamed: 0,atom_index,atom_index_closest,distance_closest,molecule_name,x_closest,y_closest,z_closest
0,1,0,1.09195,dsgdb9nsd_000001,-0.0127,1.0858,0.008
4,2,0,1.09195,dsgdb9nsd_000001,-0.0127,1.0858,0.008
7,3,0,1.09195,dsgdb9nsd_000001,-0.0127,1.0858,0.008
9,4,0,1.09195,dsgdb9nsd_000001,-0.0127,1.0858,0.008
10,1,0,1.01719,dsgdb9nsd_000002,-0.04043,1.02411,0.06256


In [12]:
for atom_idx in [0,1]:
    df_train = map_atom_info(df_train, df_train_, atom_idx)
    df_train = df_train.rename(columns={'atom_index_closest': f'atom_index_closest_{atom_idx}',
                                        'distance_closest': f'distance_closest_{atom_idx}',
                                        'x_closest': f'x_closest_{atom_idx}',
                                        'y_closest': f'y_closest_{atom_idx}',
                                        'z_closest': f'z_closest_{atom_idx}'})
    
    df_test = map_atom_info(df_test, df_test_, atom_idx)
    df_test = df_test.rename(columns={'atom_index_closest': f'atom_index_closest_{atom_idx}',
                                        'distance_closest': f'distance_closest_{atom_idx}',
                                        'x_closest': f'x_closest_{atom_idx}',
                                        'y_closest': f'y_closest_{atom_idx}',
                                        'z_closest': f'z_closest_{atom_idx}'})
    

#assert n_train == df_train.shape[0], f"{n_train} {df_train.shape[0]}"
#assert n_test == df_test.shape[0], f"{n_test} {df_test.shape[0]}"

In [13]:
df_train.head()

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant,atom_0,x_0,y_0,z_0,atom_1,x_1,y_1,z_1,dx,dy,dz,distance,atom_index_closest_0,distance_closest_0,x_closest_0,y_closest_0,z_closest_0,atom_index_closest_1,distance_closest_1,x_closest_1,y_closest_1,z_closest_1
0,0,dsgdb9nsd_000001,1,0,1JHC,84.8076,H,0.00215,-0.00603,0.00198,C,-0.0127,1.0858,0.008,-0.01485,1.09184,0.00602,1.09195,0,1.09195,-0.0127,1.0858,0.008,3,1.09195,-0.54082,1.44753,-0.87664
1,1,dsgdb9nsd_000001,1,2,2JHH,-11.257,H,0.00215,-0.00603,0.00198,H,1.01173,1.46375,0.00028,1.00958,1.46978,-0.0017,1.78312,0,1.09195,-0.0127,1.0858,0.008,0,1.09195,-0.0127,1.0858,0.008
2,2,dsgdb9nsd_000001,1,3,2JHH,-11.2548,H,0.00215,-0.00603,0.00198,H,-0.54082,1.44753,-0.87664,-0.54297,1.45356,-0.87862,1.78315,0,1.09195,-0.0127,1.0858,0.008,0,1.09195,-0.0127,1.0858,0.008
3,3,dsgdb9nsd_000001,1,4,2JHH,-11.2543,H,0.00215,-0.00603,0.00198,H,-0.52381,1.43793,0.9064,-0.52596,1.44396,0.90442,1.78316,0,1.09195,-0.0127,1.0858,0.008,0,1.09195,-0.0127,1.0858,0.008
4,4,dsgdb9nsd_000001,2,0,1JHC,84.8074,H,1.01173,1.46375,0.00028,C,-0.0127,1.0858,0.008,-1.02443,-0.37795,0.00772,1.09195,0,1.09195,-0.0127,1.0858,0.008,3,1.09195,-0.54082,1.44753,-0.87664


In [14]:
cnt = df_train.id.value_counts()
cnt.head(10)

1871046    2
1871043    2
1871035    2
1871037    2
1871042    2
1871045    2
1871039    2
4230468    1
46403      1
4414782    1
Name: id, dtype: int64

In [15]:
duplicate_ids = cnt[cnt==2].index.values
reduce_duplicates_df = df_train[df_train.id.isin(duplicate_ids)].groupby("id").first()
df_train = pd.concat([df_train[~df_train.id.isin(duplicate_ids)], reduce_duplicates_df], axis=0)

In [16]:
df_train = add_cos_features(df_train)
df_test  = add_cos_features(df_test)

In [17]:
df_train.id.unique()[-100:]

array([4658048., 4658049., 4658050., 4658051., 4658052., 4658053.,
       4658054., 4658055., 4658056., 4658057., 4658058., 4658059.,
       4658060., 4658061., 4658062., 4658063., 4658064., 4658065.,
       4658066., 4658067., 4658068., 4658069., 4658070., 4658071.,
       4658072., 4658073., 4658074., 4658075., 4658076., 4658077.,
       4658078., 4658079., 4658080., 4658081., 4658082., 4658083.,
       4658084., 4658085., 4658086., 4658087., 4658088., 4658089.,
       4658090., 4658091., 4658092., 4658093., 4658094., 4658095.,
       4658096., 4658097., 4658098., 4658099., 4658100., 4658101.,
       4658102., 4658103., 4658104., 4658105., 4658106., 4658107.,
       4658108., 4658109., 4658110., 4658111., 4658112., 4658113.,
       4658114., 4658115., 4658116., 4658117., 4658118., 4658119.,
       4658120., 4658121., 4658122., 4658123., 4658124., 4658125.,
       4658126., 4658127., 4658128., 4658129., 4658130., 4658131.,
       4658132., 4658133., 4658134., 4658135., 4658136., 46581

In [18]:
df_train.head(100)

Unnamed: 0,atom_0,atom_1,atom_index_0,atom_index_1,atom_index_closest_0,atom_index_closest_1,distance,distance_closest_0,distance_closest_1,dx,dy,dz,id,molecule_name,scalar_coupling_constant,type,x_0,x_1,x_closest_0,x_closest_1,y_0,y_1,y_closest_0,y_closest_1,z_0,z_1,z_closest_0,z_closest_1,distance_0,distance_1,cos_0_1,cos_0,cos_1
0,H,C,1,0,0,3,1.09195,1.09195,1.09195,-0.01485,1.09184,0.00602,0.0,dsgdb9nsd_000001,84.8076,1JHC,0.00215,-0.0127,-0.0127,-0.54082,-0.00603,1.0858,1.0858,1.44753,0.00198,0.008,0.008,-0.87664,1.09195,1.09195,0.33333,-1.0,-0.33333
1,H,H,1,2,0,0,1.78312,1.09195,1.09195,1.00958,1.46978,-0.0017,1.0,dsgdb9nsd_000001,-11.257,2JHH,0.00215,1.01173,-0.0127,-0.0127,-0.00603,1.46375,1.0858,1.0858,0.00198,0.00028,0.008,0.008,1.09195,1.09195,-0.33329,-0.81648,0.81648
2,H,H,1,3,0,0,1.78315,1.09195,1.09195,-0.54297,1.45356,-0.87862,2.0,dsgdb9nsd_000001,-11.2548,2JHH,0.00215,-0.54082,-0.0127,-0.0127,-0.00603,1.44753,1.0858,1.0858,0.00198,-0.87664,0.008,0.008,1.09195,1.09195,-0.33333,-0.8165,0.8165
3,H,H,1,4,0,0,1.78316,1.09195,1.09195,-0.52596,1.44396,0.90442,3.0,dsgdb9nsd_000001,-11.2543,2JHH,0.00215,-0.52381,-0.0127,-0.0127,-0.00603,1.43793,1.0858,1.0858,0.00198,0.9064,0.008,0.008,1.09195,1.09195,-0.33335,-0.8165,0.8165
4,H,C,2,0,0,3,1.09195,1.09195,1.09195,-1.02443,-0.37795,0.00772,4.0,dsgdb9nsd_000001,84.8074,1JHC,1.01173,-0.0127,-0.0127,-0.54082,1.46375,1.0858,1.0858,1.44753,0.00028,0.008,0.008,-0.87664,1.09195,1.09195,0.33335,-1.0,-0.33335
5,H,H,2,3,0,0,1.78316,1.09195,1.09195,-1.55255,-0.01622,-0.87692,5.0,dsgdb9nsd_000001,-11.2541,2JHH,1.01173,-0.54082,-0.0127,-0.0127,1.46375,1.44753,1.0858,1.0858,0.00028,-0.87664,0.008,0.008,1.09195,1.09195,-0.33335,-0.8165,0.8165
6,H,H,2,4,0,0,1.78315,1.09195,1.09195,-1.53554,-0.02582,0.90612,6.0,dsgdb9nsd_000001,-11.2548,2JHH,1.01173,-0.52381,-0.0127,-0.0127,1.46375,1.43793,1.0858,1.0858,0.00028,0.9064,0.008,0.008,1.09195,1.09195,-0.33334,-0.8165,0.8165
7,H,C,3,0,0,3,1.09195,1.09195,1.09195,0.52812,-0.36172,0.88464,7.0,dsgdb9nsd_000001,84.8093,1JHC,-0.54082,-0.0127,-0.0127,-0.54082,1.44753,1.0858,1.0858,1.44753,-0.87664,0.008,0.008,-0.87664,1.09195,1.09195,-1.0,-1.0,1.0
8,H,H,3,4,0,0,1.78315,1.09195,1.09195,0.017,-0.00959,1.78304,8.0,dsgdb9nsd_000001,-11.2543,2JHH,-0.54082,-0.52381,-0.0127,-0.0127,1.44753,1.43793,1.0858,1.0858,-0.87664,0.9064,0.008,0.008,1.09195,1.09195,-0.33334,-0.8165,0.8165
9,H,C,4,0,0,3,1.09195,1.09195,1.09195,0.51112,-0.35213,-0.8984,9.0,dsgdb9nsd_000001,84.8095,1JHC,-0.52381,-0.0127,-0.0127,-0.54082,1.43793,1.0858,1.0858,1.44753,0.9064,0.008,0.008,-0.87664,1.09195,1.09195,0.33334,-1.0,-0.33334


In [19]:
df_test.head(100)

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,atom_0,x_0,y_0,z_0,atom_1,x_1,y_1,z_1,dx,dy,dz,distance,atom_index_closest_0,distance_closest_0,x_closest_0,y_closest_0,z_closest_0,atom_index_closest_1,distance_closest_1,x_closest_1,y_closest_1,z_closest_1,distance_0,distance_1,cos_0_1,cos_0,cos_1
0,4658147,dsgdb9nsd_000004,2,0,2JHC,H,-1.66164,0.0,1.0,C,0.59954,0.0,1.0,2.26118,0.0,0.0,2.26118,1,1.0621,-0.59954,0.0,1.0,3,1.0621,1.66164,0.0,1.0,1.0621,1.0621,1.0,-1.0,-1.0
1,4658148,dsgdb9nsd_000004,2,1,1JHC,H,-1.66164,0.0,1.0,C,-0.59954,0.0,1.0,1.0621,0.0,0.0,1.0621,1,1.0621,-0.59954,0.0,1.0,2,1.0621,-1.66164,0.0,1.0,1.0621,1.0621,-1.0,-1.0,1.0
2,4658149,dsgdb9nsd_000004,2,3,3JHH,H,-1.66164,0.0,1.0,H,1.66164,0.0,1.0,3.32328,0.0,0.0,3.32328,1,1.0621,-0.59954,0.0,1.0,0,1.0621,0.59954,0.0,1.0,1.0621,1.0621,-1.0,-1.0,1.0
3,4658150,dsgdb9nsd_000004,3,0,1JHC,H,1.66164,0.0,1.0,C,0.59954,0.0,1.0,-1.0621,0.0,0.0,1.0621,0,1.0621,0.59954,0.0,1.0,3,1.0621,1.66164,0.0,1.0,1.0621,1.0621,-1.0,-1.0,1.0
4,4658151,dsgdb9nsd_000004,3,1,2JHC,H,1.66164,0.0,1.0,C,-0.59954,0.0,1.0,-2.26118,0.0,0.0,2.26118,0,1.0621,0.59954,0.0,1.0,2,1.0621,-1.66164,0.0,1.0,1.0621,1.0621,1.0,-1.0,-1.0
5,4658152,dsgdb9nsd_000015,3,0,1JHC,H,1.00528,1.81016,0.00466,C,-0.01482,1.39241,0.00567,-1.02011,-0.41775,0.00101,1.10233,0,1.10233,-0.01482,1.39241,0.00567,5,1.09285,-0.53003,1.72292,0.91102,1.10233,1.09285,0.32242,-1.0,-0.32242
6,4658153,dsgdb9nsd_000015,3,2,3JHC,H,1.00528,1.81016,0.00466,C,0.63795,-0.5533,-1.11358,-0.36733,-2.36345,-1.11824,2.64032,0,1.10233,-0.01482,1.39241,0.00567,8,1.09285,0.59959,-1.6418,-1.02408,1.10233,1.09285,0.41001,-0.46759,-0.86177
7,4658154,dsgdb9nsd_000015,3,4,2JHH,H,1.00528,1.81016,0.00466,H,-0.5469,1.79344,-0.87251,-1.55218,-0.01672,-0.87717,1.78296,0,1.10233,-0.01482,1.39241,0.00567,0,1.10233,-0.01482,1.39241,0.00567,1.10233,1.10233,-0.30808,-0.80873,0.80873
8,4658155,dsgdb9nsd_000015,3,5,2JHH,H,1.00528,1.81016,0.00466,H,-0.53003,1.72292,0.91102,-1.53531,-0.08724,0.90636,1.78502,0,1.10233,-0.01482,1.39241,0.00567,0,1.09285,-0.01482,1.39241,0.00567,1.10233,1.09285,-0.32242,-0.81494,0.81135
9,4658156,dsgdb9nsd_000015,4,0,1JHC,H,-0.5469,1.79344,-0.87251,C,-0.01482,1.39241,0.00567,0.53207,-0.40102,0.87818,1.10233,0,1.10233,-0.01482,1.39241,0.00567,5,1.09285,-0.53003,1.72292,0.91102,1.10233,1.09285,0.3224,-1.0,-0.3224


In [20]:
df_train.shape, df_test.shape

((4658147, 33), (2505542, 32))

In [21]:
df_train.columns = [f"f003:{c}" for c in df_train.columns] 
df_test.columns = [f"f003:{c}" for c in df_test.columns]
df_train.rename({"f003:id":"id"}, axis=1, inplace=True)
df_test.rename({"f003:id":"id"}, axis=1, inplace=True)

In [22]:
df_train.head()

Unnamed: 0,f003:atom_0,f003:atom_1,f003:atom_index_0,f003:atom_index_1,f003:atom_index_closest_0,f003:atom_index_closest_1,f003:distance,f003:distance_closest_0,f003:distance_closest_1,f003:dx,f003:dy,f003:dz,id,f003:molecule_name,f003:scalar_coupling_constant,f003:type,f003:x_0,f003:x_1,f003:x_closest_0,f003:x_closest_1,f003:y_0,f003:y_1,f003:y_closest_0,f003:y_closest_1,f003:z_0,f003:z_1,f003:z_closest_0,f003:z_closest_1,f003:distance_0,f003:distance_1,f003:cos_0_1,f003:cos_0,f003:cos_1
0,H,C,1,0,0,3,1.09195,1.09195,1.09195,-0.01485,1.09184,0.00602,0.0,dsgdb9nsd_000001,84.8076,1JHC,0.00215,-0.0127,-0.0127,-0.54082,-0.00603,1.0858,1.0858,1.44753,0.00198,0.008,0.008,-0.87664,1.09195,1.09195,0.33333,-1.0,-0.33333
1,H,H,1,2,0,0,1.78312,1.09195,1.09195,1.00958,1.46978,-0.0017,1.0,dsgdb9nsd_000001,-11.257,2JHH,0.00215,1.01173,-0.0127,-0.0127,-0.00603,1.46375,1.0858,1.0858,0.00198,0.00028,0.008,0.008,1.09195,1.09195,-0.33329,-0.81648,0.81648
2,H,H,1,3,0,0,1.78315,1.09195,1.09195,-0.54297,1.45356,-0.87862,2.0,dsgdb9nsd_000001,-11.2548,2JHH,0.00215,-0.54082,-0.0127,-0.0127,-0.00603,1.44753,1.0858,1.0858,0.00198,-0.87664,0.008,0.008,1.09195,1.09195,-0.33333,-0.8165,0.8165
3,H,H,1,4,0,0,1.78316,1.09195,1.09195,-0.52596,1.44396,0.90442,3.0,dsgdb9nsd_000001,-11.2543,2JHH,0.00215,-0.52381,-0.0127,-0.0127,-0.00603,1.43793,1.0858,1.0858,0.00198,0.9064,0.008,0.008,1.09195,1.09195,-0.33335,-0.8165,0.8165
4,H,C,2,0,0,3,1.09195,1.09195,1.09195,-1.02443,-0.37795,0.00772,4.0,dsgdb9nsd_000001,84.8074,1JHC,1.01173,-0.0127,-0.0127,-0.54082,1.46375,1.0858,1.0858,1.44753,0.00028,0.008,0.008,-0.87664,1.09195,1.09195,0.33335,-1.0,-0.33335


In [23]:
df_test.head()

Unnamed: 0,id,f003:molecule_name,f003:atom_index_0,f003:atom_index_1,f003:type,f003:atom_0,f003:x_0,f003:y_0,f003:z_0,f003:atom_1,f003:x_1,f003:y_1,f003:z_1,f003:dx,f003:dy,f003:dz,f003:distance,f003:atom_index_closest_0,f003:distance_closest_0,f003:x_closest_0,f003:y_closest_0,f003:z_closest_0,f003:atom_index_closest_1,f003:distance_closest_1,f003:x_closest_1,f003:y_closest_1,f003:z_closest_1,f003:distance_0,f003:distance_1,f003:cos_0_1,f003:cos_0,f003:cos_1
0,4658147,dsgdb9nsd_000004,2,0,2JHC,H,-1.66164,0.0,1.0,C,0.59954,0.0,1.0,2.26118,0.0,0.0,2.26118,1,1.0621,-0.59954,0.0,1.0,3,1.0621,1.66164,0.0,1.0,1.0621,1.0621,1.0,-1.0,-1.0
1,4658148,dsgdb9nsd_000004,2,1,1JHC,H,-1.66164,0.0,1.0,C,-0.59954,0.0,1.0,1.0621,0.0,0.0,1.0621,1,1.0621,-0.59954,0.0,1.0,2,1.0621,-1.66164,0.0,1.0,1.0621,1.0621,-1.0,-1.0,1.0
2,4658149,dsgdb9nsd_000004,2,3,3JHH,H,-1.66164,0.0,1.0,H,1.66164,0.0,1.0,3.32328,0.0,0.0,3.32328,1,1.0621,-0.59954,0.0,1.0,0,1.0621,0.59954,0.0,1.0,1.0621,1.0621,-1.0,-1.0,1.0
3,4658150,dsgdb9nsd_000004,3,0,1JHC,H,1.66164,0.0,1.0,C,0.59954,0.0,1.0,-1.0621,0.0,0.0,1.0621,0,1.0621,0.59954,0.0,1.0,3,1.0621,1.66164,0.0,1.0,1.0621,1.0621,-1.0,-1.0,1.0
4,4658151,dsgdb9nsd_000004,3,1,2JHC,H,1.66164,0.0,1.0,C,-0.59954,0.0,1.0,-2.26118,0.0,0.0,2.26118,0,1.0621,0.59954,0.0,1.0,2,1.0621,-1.66164,0.0,1.0,1.0621,1.0621,1.0,-1.0,-1.0


In [24]:
# idが抜けていたところを埋める
train_org = pd.read_csv('../input/train.csv')
train_org_s = train_org[train_org.molecule_name=="dsgdb9nsd_059818"]
train_org_s

for i, d in df_train[df_train.id.isna()].iterrows():
    #display(d)
    idx0 = d["f003:atom_index_0"]
    idx1 = d["f003:atom_index_1"]
    id_ = train_org_s.query(f"atom_index_0 == {idx0} and atom_index_1=={idx1}").iloc[0].id
    df_train.loc[d.name, "id"] = id_
    
df_train[df_train.id.isna()]

Unnamed: 0,f003:atom_0,f003:atom_1,f003:atom_index_0,f003:atom_index_1,f003:atom_index_closest_0,f003:atom_index_closest_1,f003:distance,f003:distance_closest_0,f003:distance_closest_1,f003:dx,f003:dy,f003:dz,id,f003:molecule_name,f003:scalar_coupling_constant,f003:type,f003:x_0,f003:x_1,f003:x_closest_0,f003:x_closest_1,f003:y_0,f003:y_1,f003:y_closest_0,f003:y_closest_1,f003:z_0,f003:z_1,f003:z_closest_0,f003:z_closest_1,f003:distance_0,f003:distance_1,f003:cos_0_1,f003:cos_0,f003:cos_1


In [25]:
df_train["id"]= df_train.id.astype(int)

In [26]:
df_train.reset_index(drop=True, inplace=True)

In [27]:
df_train.loc[1871035, "id"] = 1871036
df_train.loc[1871037, "id"] = 1871040
df_train.loc[4658141, "id"] = 1871037
df_train.loc[4658144, "id"] = 1871043

In [28]:
DATA_VERSION = "v001"
save_path = Path(f"../processed/{DATA_VERSION}")
save_path.mkdir(parents=True, exist_ok=True)

In [29]:
to_pickle(save_path/"train_003.df.pkl", df_train[["id", "f003:cos_0_1","f003:cos_0","f003:cos_1"]])
to_pickle(save_path/"test_003.df.pkl", df_test[["id", "f003:cos_0_1","f003:cos_0","f003:cos_1"]])

In [30]:
df_train[["id", "f003:cos_0_1","f003:cos_0","f003:cos_1"]].head()

Unnamed: 0,id,f003:cos_0_1,f003:cos_0,f003:cos_1
0,0,0.33333,-1.0,-0.33333
1,1,-0.33329,-0.81648,0.81648
2,2,-0.33333,-0.8165,0.8165
3,3,-0.33335,-0.8165,0.8165
4,4,0.33335,-1.0,-0.33335


In [31]:
df_test[["id", "f003:cos_0_1","f003:cos_0","f003:cos_1"]].head()

Unnamed: 0,id,f003:cos_0_1,f003:cos_0,f003:cos_1
0,4658147,1.0,-1.0,-1.0
1,4658148,-1.0,-1.0,1.0
2,4658149,-1.0,-1.0,1.0
3,4658150,-1.0,-1.0,1.0
4,4658151,1.0,-1.0,-1.0


In [32]:
df_train.shape, df_test.shape

((4658147, 33), (2505542, 32))

In [33]:
df_train.id.value_counts().head(30)

4196351    1
4255048    1
4242754    1
46403      1
4230468    1
34117      1
4234566    1
38215      1
58697      1
4238656    1
4259146    1
62795      1
4246860    1
50509      1
4250958    1
54607      1
42305      1
218431     1
9553       1
4398390    1
4402480    1
206129     1
4406578    1
210227     1
4394292    1
197941     1
202039     1
4414782    1
4418872    1
222521     1
Name: id, dtype: int64

In [34]:
# 1871035	1871036 15	11
# 1871037	1871040 15	18
# 4658141	1871037 15	14
# 4658144	1871043 16	17

In [35]:
# df_train.loc[1871035, "id"] = 1871036
# df_train.loc[1871037, "id"] = 1871040
# df_train.loc[4658141, "id"] = 1871037
# df_train.loc[4658144, "id"] = 1871043

In [36]:
# train_org_s