
https://www.kaggle.com/criskiev/distance-is-all-you-need-lb-1-481

In [8]:
%matplotlib inline

import pandas as pd
import numpy as np

import math
import gc
import copy

from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import mean_absolute_error

import matplotlib.pyplot as plt
import seaborn as sns

from lightgbm import LGBMRegressor

from IPython.display import display

In [2]:
DATA_PATH = '../input'
SUBMISSIONS_PATH = './'
# use atomic numbers to recode atomic names
# 原子番号
ATOMIC_NUMBERS = {
    'H': 1,
    'C': 6,
    'N': 7,
    'O': 8,
    'F': 9
}

In [3]:
pd.set_option('display.max_colwidth', -1)
pd.set_option('display.max_rows', 120)
pd.set_option('display.max_columns', 120)

In [6]:
train_dtypes = {
    'molecule_name': 'category',
    'atom_index_0': 'int8',
    'atom_index_1': 'int8',
    'type': 'category',
    'scalar_coupling_constant': 'float32'
}
train_csv = pd.read_csv(f'{DATA_PATH}/train.csv', index_col='id', dtype=train_dtypes)
# molecue nameをindex番号に変更
train_csv['molecule_index'] = train_csv.molecule_name.str.replace('dsgdb9nsd_', '').astype('int32')
train_csv = train_csv[['molecule_index', 'atom_index_0', 'atom_index_1', 'type', 'scalar_coupling_constant']]
train_csv.head(10)

Unnamed: 0_level_0,molecule_index,atom_index_0,atom_index_1,type,scalar_coupling_constant
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,1,1,0,1JHC,84.807602
1,1,1,2,2JHH,-11.257
2,1,1,3,2JHH,-11.2548
3,1,1,4,2JHH,-11.2543
4,1,2,0,1JHC,84.807404
5,1,2,3,2JHH,-11.2541
6,1,2,4,2JHH,-11.2548
7,1,3,0,1JHC,84.809303
8,1,3,4,2JHH,-11.2543
9,1,4,0,1JHC,84.809502


In [7]:
submission_csv = pd.read_csv(f'{DATA_PATH}/sample_submission.csv', index_col='id')

In [9]:
test_csv = pd.read_csv(f'{DATA_PATH}/test.csv', index_col='id', dtype=train_dtypes)
test_csv['molecule_index'] = test_csv['molecule_name'].str.replace('dsgdb9nsd_', '').astype('int32')
test_csv = test_csv[['molecule_index', 'atom_index_0', 'atom_index_1', 'type']]
test_csv.head(10)

  mask |= (ar1 == a)


Unnamed: 0_level_0,molecule_index,atom_index_0,atom_index_1,type
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
4658147,4,2,0,2JHC
4658148,4,2,1,1JHC
4658149,4,2,3,3JHH
4658150,4,3,0,1JHC
4658151,4,3,1,2JHC
4658152,15,3,0,1JHC
4658153,15,3,2,3JHC
4658154,15,3,4,2JHH
4658155,15,3,5,2JHH
4658156,15,4,0,1JHC


In [17]:
structures_csv = pd.read_csv(f'{DATA_PATH}/structures.csv', dtype=structures_dtypes)
structures_csv.molecule_name.str.replace('dsgdb9nsd_','').astype('int')

0          1     
1          1     
2          1     
3          1     
4          1     
5          2     
6          2     
7          2     
8          2     
9          3     
10         3     
11         3     
12         4     
13         4     
14         4     
15         4     
16         5     
17         5     
18         5     
19         7     
20         7     
21         7     
22         7     
23         7     
24         7     
25         7     
26         7     
27         8     
28         8     
29         8     
30         8     
31         8     
32         8     
33         9     
34         9     
35         9     
36         9     
37         9     
38         9     
39         9     
40         10    
41         10    
42         10    
43         10    
44         10    
45         10    
46         11    
47         11    
48         11    
49         11    
50         11    
51         11    
52         11    
53         12    
54         12    
55        

In [24]:
structures_dtypes = {
    'molecule_name': 'category',
    'atom_index': 'int8',
    'atom': 'category',
    'x': 'float32',
    'y': 'float32',
    'z': 'float32'
}
structures_csv = pd.read_csv(f'{DATA_PATH}/structures.csv', dtype=structures_dtypes)
display(structures_csv.head())

# atom name=>index atom=>原子番号
structures_csv['molecule_index'] = structures_csv.molecule_name.str.replace('dsgdb9nsd_', '').astype('int32')
structures_csv = structures_csv[['molecule_index', 'atom_index', 'atom', 'x', 'y', 'z']]
structures_csv['atom'] = structures_csv['atom'].replace(ATOMIC_NUMBERS).astype('int8')
structures_csv.head(10)

Unnamed: 0,molecule_name,atom_index,atom,x,y,z
0,dsgdb9nsd_000001,0,C,-0.012698,1.085804,0.008001
1,dsgdb9nsd_000001,1,H,0.00215,-0.006031,0.001976
2,dsgdb9nsd_000001,2,H,1.011731,1.463751,0.000277
3,dsgdb9nsd_000001,3,H,-0.540815,1.447527,-0.876644
4,dsgdb9nsd_000001,4,H,-0.523814,1.437933,0.906397


Unnamed: 0,molecule_index,atom_index,atom,x,y,z
0,1,0,6,-0.012698,1.085804,0.008001
1,1,1,1,0.00215,-0.006031,0.001976
2,1,2,1,1.011731,1.463751,0.000277
3,1,3,1,-0.540815,1.447527,-0.876644
4,1,4,1,-0.523814,1.437933,0.906397
5,2,0,7,-0.040426,1.024108,0.062564
6,2,1,1,0.017257,0.012545,-0.027377
7,2,2,1,0.915789,1.358745,-0.028758
8,2,3,1,-0.520278,1.343532,-0.775543
9,3,0,8,-0.03436,0.97754,0.007602


In [27]:
def build_type_dataframes(base, structures, coupling_type):
    # type列を削除
    base = base[base['type'] == coupling_type].drop('type', axis=1).copy()
    base = base.reset_index()
    # coupling_typeでのidを列で保存
    base['id'] = base['id'].astype('int32')
    # molecue_indexがtrainにあるものの特徴を取得
    structures = structures[structures['molecule_index'].isin(base['molecule_index'])]
    return base, structures
#train_csv[train_csv['type']=='2JHC'].drop(['type'],axis=1).reset_index()
# structures_csv[structures_csv['molecule_index'].isin(train_csv['molecule_index'])]

Unnamed: 0,molecule_index,atom_index,atom,x,y,z
0,1,0,6,-0.012698,1.085804,0.008001
1,1,1,1,0.002150,-0.006031,0.001976
2,1,2,1,1.011731,1.463751,0.000277
3,1,3,1,-0.540815,1.447527,-0.876644
4,1,4,1,-0.523814,1.437933,0.906397
5,2,0,7,-0.040426,1.024108,0.062564
6,2,1,1,0.017257,0.012545,-0.027377
7,2,2,1,0.915789,1.358745,-0.028758
8,2,3,1,-0.520278,1.343532,-0.775543
9,3,0,8,-0.034360,0.977540,0.007602


In [28]:
def add_coordinates(base, structures, index):
    df = pd.merge(base, structures, how='inner',
                  left_on=['molecule_index', f'atom_index_{index}'],
                  right_on=['molecule_index', 'atom_index']).drop(['atom_index'], axis=1)
    df = df.rename(columns={
        'atom': f'atom_{index}',
        'x': f'x_{index}',
        'y': f'y_{index}',
        'z': f'z_{index}'
    })
    return df
add_coordinates(train_csv,structures_csv,1)

Unnamed: 0,molecule_index,atom_index_0,atom_index_1,type,scalar_coupling_constant,atom_1,x_1,y_1,z_1
0,1,1,0,1JHC,84.807602,6,-0.012698,1.085804,0.008001
1,1,2,0,1JHC,84.807404,6,-0.012698,1.085804,0.008001
2,1,3,0,1JHC,84.809303,6,-0.012698,1.085804,0.008001
3,1,4,0,1JHC,84.809502,6,-0.012698,1.085804,0.008001
4,1,1,2,2JHH,-11.257000,1,1.011731,1.463751,0.000277
5,1,1,3,2JHH,-11.254800,1,-0.540815,1.447527,-0.876644
6,1,2,3,2JHH,-11.254100,1,-0.540815,1.447527,-0.876644
7,1,1,4,2JHH,-11.254300,1,-0.523814,1.437933,0.906397
8,1,2,4,2JHH,-11.254800,1,-0.523814,1.437933,0.906397
9,1,3,4,2JHH,-11.254300,1,-0.523814,1.437933,0.906397


In [29]:
def add_atoms(base, atoms):
    df = pd.merge(base, atoms, how='inner',
                  on=['molecule_index', 'atom_index_0', 'atom_index_1'])
    return df

In [30]:
def merge_all_atoms(base, structures):
    df = pd.merge(base, structures, how='left',
                  left_on=['molecule_index'],
                  right_on=['molecule_index'])
    df = df[(df.atom_index_0 != df.atom_index) & (df.atom_index_1 != df.atom_index)]
    return df

In [36]:
# 2点の中心点
def add_center(df):
    df['x_c'] = ((df['x_1'] + df['x_0']) * np.float32(0.5))
    df['y_c'] = ((df['y_1'] + df['y_0']) * np.float32(0.5))
    df['z_c'] = ((df['z_1'] + df['z_0']) * np.float32(0.5))

# 中心点からの距離
def add_distance_to_center(df):
    df['d_c'] = ((
        (df['x_c'] - df['x'])**np.float32(2) +
        (df['y_c'] - df['y'])**np.float32(2) + 
        (df['z_c'] - df['z'])**np.float32(2)
    )**np.float32(0.5))

# 原子間距離
def add_distance_between(df, suffix1, suffix2):
    df[f'd_{suffix1}_{suffix2}'] = ((
        (df[f'x_{suffix1}'] - df[f'x_{suffix2}'])**np.float32(2) +
        (df[f'y_{suffix1}'] - df[f'y_{suffix2}'])**np.float32(2) + 
        (df[f'z_{suffix1}'] - df[f'z_{suffix2}'])**np.float32(2)
    )**np.float32(0.5))

In [31]:
def add_distances(df):
    n_atoms = 1 + max([int(c.split('_')[1]) for c in df.columns if c.startswith('x_')])
    
    for i in range(1, n_atoms):
        for vi in range(min(4, i)):
            add_distance_between(df, i, vi)

In [32]:
def add_n_atoms(base, structures):
    dfs = structures['molecule_index'].value_counts().rename('n_atoms').to_frame()
    return pd.merge(base, dfs, left_on='molecule_index', right_index=True)

In [39]:
def build_couple_dataframe(some_csv, structures_csv, coupling_type, n_atoms=10):
    base, structures = build_type_dataframes(some_csv, structures_csv, coupling_type)
    base = add_coordinates(base, structures, 0)
    base = add_coordinates(base, structures, 1)
    base = base.drop(['atom_0', 'atom_1'], axis=1)
    atoms = base.drop('id', axis=1).copy()
    if 'scalar_coupling_constant' in some_csv:
        atoms = atoms.drop(['scalar_coupling_constant'], axis=1)
        
    add_center(atoms)
    atoms = atoms.drop(['x_0', 'y_0', 'z_0', 'x_1', 'y_1', 'z_1'], axis=1)

    atoms = merge_all_atoms(atoms, structures)
    
    add_distance_to_center(atoms)
    
    atoms = atoms.drop(['x_c', 'y_c', 'z_c', 'atom_index'], axis=1)
    atoms.sort_values(['molecule_index', 'atom_index_0', 'atom_index_1', 'd_c'], inplace=True)
    atom_groups = atoms.groupby(['molecule_index', 'atom_index_0', 'atom_index_1'])
    atoms['num'] = atom_groups.cumcount() + 2
    atoms = atoms.drop(['d_c'], axis=1)
    atoms = atoms[atoms['num'] < n_atoms]

    atoms = atoms.set_index(['molecule_index', 'atom_index_0', 'atom_index_1', 'num']).unstack()
    atoms.columns = [f'{col[0]}_{col[1]}' for col in atoms.columns]
    atoms = atoms.reset_index()
    
    # downcast back to int8
    for col in atoms.columns:
        if col.startswith('atom_'):
            atoms[col] = atoms[col].fillna(0).astype('int8')
            
    atoms['molecule_index'] = atoms['molecule_index'].astype('int32')
    
    full = add_atoms(base, atoms)
    add_distances(full)
    
    full.sort_values('id', inplace=True)
    

    return full

In [45]:
# coupling_typeで原子数が異なるため、atom_9が最大となる

full=build_couple_dataframe(train_csv, structures_csv,'1JHN', n_atoms=10)

Unnamed: 0,id,molecule_index,atom_index_0,atom_index_1,scalar_coupling_constant
0,10,2,1,0,32.6889
1,13,2,2,0,32.689098
2,15,2,3,0,32.690498
3,97,12,3,0,55.5252
4,101,12,4,0,54.735901


Unnamed: 0,id,molecule_index,atom_index_0,atom_index_1,scalar_coupling_constant,atom_0,x_0,y_0,z_0
0,10,2,1,0,32.6889,1,0.017257,0.012545,-0.027377
1,13,2,2,0,32.689098,1,0.915789,1.358745,-0.028758
2,15,2,3,0,32.690498,1,-0.520278,1.343532,-0.775543
3,97,12,3,0,55.5252,1,0.825355,1.885049,0.003738
4,101,12,4,0,54.735901,1,-0.908377,1.826796,0.01892


Unnamed: 0,id,molecule_index,atom_index_0,atom_index_1,scalar_coupling_constant,atom_0,x_0,y_0,z_0,atom_1,x_1,y_1,z_1
0,10,2,1,0,32.6889,1,0.017257,0.012545,-0.027377,7,-0.040426,1.024108,0.062564
1,13,2,2,0,32.689098,1,0.915789,1.358745,-0.028758,7,-0.040426,1.024108,0.062564
2,15,2,3,0,32.690498,1,-0.520278,1.343532,-0.775543,7,-0.040426,1.024108,0.062564
3,97,12,3,0,55.5252,1,0.825355,1.885049,0.003738,7,-0.0259,1.346146,0.008894
4,101,12,4,0,54.735901,1,-0.908377,1.826796,0.01892,7,-0.0259,1.346146,0.008894


Unnamed: 0,id,molecule_index,atom_index_0,atom_index_1,scalar_coupling_constant,x_0,y_0,z_0,x_1,y_1,z_1
0,10,2,1,0,32.6889,0.017257,0.012545,-0.027377,-0.040426,1.024108,0.062564
1,13,2,2,0,32.689098,0.915789,1.358745,-0.028758,-0.040426,1.024108,0.062564
2,15,2,3,0,32.690498,-0.520278,1.343532,-0.775543,-0.040426,1.024108,0.062564
3,97,12,3,0,55.5252,0.825355,1.885049,0.003738,-0.0259,1.346146,0.008894
4,101,12,4,0,54.735901,-0.908377,1.826796,0.01892,-0.0259,1.346146,0.008894


Unnamed: 0,molecule_index,atom_index_0,atom_index_1,atom_2,atom_3,atom_4,atom_5,atom_6,atom_7,atom_8,atom_9,x_2,x_3,x_4,x_5,x_6,x_7,x_8,x_9,y_2,y_3,y_4,y_5,y_6,y_7,y_8,y_9,z_2,z_3,z_4,z_5,z_6,z_7,z_8,z_9
0,2,1,0,1,1,0,0,0,0,0,0,0.915789,-0.520278,,,,,,,1.358745,1.343532,,,,,,,-0.028758,-0.775543,,,,,,
1,2,2,0,1,1,0,0,0,0,0,0,0.017257,-0.520278,,,,,,,0.012545,1.343532,,,,,,,-0.027377,-0.775543,,,,,,
2,2,3,0,1,1,0,0,0,0,0,0,0.915789,0.017257,,,,,,,1.358745,0.012545,,,,,,,-0.028758,-0.027377,,,,,,
3,12,3,0,1,6,8,1,0,0,0,0,-0.908377,0.046467,1.071835,-0.961441,,,,,1.826796,-0.011743,-0.652588,-0.475004,,,,,0.01892,0.001204,-0.011133,0.008074,,,,
4,12,4,0,1,6,1,8,0,0,0,0,0.825355,0.046467,-0.961441,1.071835,,,,,1.885049,-0.011743,-0.475004,-0.652588,,,,,0.003738,0.001204,0.008074,-0.011133,,,,


Unnamed: 0,id,molecule_index,atom_index_0,atom_index_1,scalar_coupling_constant,x_0,y_0,z_0,x_1,y_1,z_1,atom_2,atom_3,atom_4,atom_5,atom_6,atom_7,atom_8,atom_9,x_2,x_3,x_4,x_5,x_6,x_7,x_8,x_9,y_2,y_3,y_4,y_5,y_6,y_7,y_8,y_9,z_2,z_3,z_4,z_5,z_6,z_7,z_8,z_9,d_1_0,d_2_0,d_2_1,d_3_0,d_3_1,d_3_2,d_4_0,d_4_1,d_4_2,d_4_3,d_5_0,d_5_1,d_5_2,d_5_3,d_6_0,d_6_1,d_6_2,d_6_3,d_7_0,d_7_1,d_7_2,d_7_3,d_8_0,d_8_1,d_8_2,d_8_3,d_9_0,d_9_1,d_9_2,d_9_3
0,10,2,1,0,32.6889,0.017257,0.012545,-0.027377,-0.040426,1.024108,0.062564,1,1,0,0,0,0,0,0,0.915789,-0.520278,,,,,,,1.358745,1.343532,,,,,,,-0.028758,-0.775543,,,,,,,1.01719,1.618523,1.017187,1.61871,1.017208,1.618706,,,,,,,,,,,,,,,,,,,,,,,,
1,13,2,2,0,32.689098,0.915789,1.358745,-0.028758,-0.040426,1.024108,0.062564,1,1,0,0,0,0,0,0,0.017257,-0.520278,,,,,,,0.012545,1.343532,,,,,,,-0.027377,-0.775543,,,,,,,1.017187,1.618523,1.01719,1.618706,1.017208,1.61871,,,,,,,,,,,,,,,,,,,,,,,,
2,15,2,3,0,32.690498,-0.520278,1.343532,-0.775543,-0.040426,1.024108,0.062564,1,1,0,0,0,0,0,0,0.915789,0.017257,,,,,,,1.358745,0.012545,,,,,,,-0.028758,-0.027377,,,,,,,1.017208,1.618706,1.017187,1.61871,1.01719,1.618523,,,,,,,,,,,,,,,,,,,,,,,,
3,97,12,3,0,55.5252,0.825355,1.885049,0.003738,-0.0259,1.346146,0.008894,1,6,8,1,0,0,0,0,-0.908377,0.046467,1.071835,-0.961441,,,,,1.826796,-0.011743,-0.652588,-0.475004,,,,,0.01892,0.001204,-0.011133,0.008074,,,,,1.007511,1.734777,1.004933,2.050487,1.359838,2.071779,2.549623,2.28043,3.173246,1.20922,2.960154,2.047394,2.302437,1.109295,,,,,,,,,,,,,,,,
4,101,12,4,0,54.735901,-0.908377,1.826796,0.01892,-0.0259,1.346146,0.008894,1,6,1,8,0,0,0,0,0.825355,0.046467,-0.961441,1.071835,,,,,1.885049,-0.011743,-0.475004,-0.652588,,,,,0.003738,0.001204,0.008074,-0.011133,,,,,1.004933,1.734777,1.007511,2.071779,1.359838,2.050487,2.302437,2.047394,2.960154,1.109295,3.173246,2.28043,2.549623,1.20922,,,,,,,,,,,,,,,,


In [46]:
def take_n_atoms(df, n_atoms, four_start=4):
    labels = []
    for i in range(2, n_atoms):
        label = f'atom_{i}'
        labels.append(label)

    for i in range(n_atoms):
        num = min(i, 4) if i < four_start else 4
        for j in range(num):
            labels.append(f'd_{i}_{j}')
    if 'scalar_coupling_constant' in df:
        labels.append('scalar_coupling_constant')
    return df[labels]

In [48]:
# 1JHNは原子数が6までしかないのでいらない特徴量列を削減
df=take_n_atoms(full,7)
df.fillna(0)

Unnamed: 0,atom_2,atom_3,atom_4,atom_5,atom_6,d_1_0,d_2_0,d_2_1,d_3_0,d_3_1,d_3_2,d_4_0,d_4_1,d_4_2,d_4_3,d_5_0,d_5_1,d_5_2,d_5_3,d_6_0,d_6_1,d_6_2,d_6_3,scalar_coupling_constant
0,1,1,0,0,0,1.017190,1.618523,1.017187,1.618710,1.017208,1.618706,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,32.688900
1,1,1,0,0,0,1.017187,1.618523,1.017190,1.618706,1.017208,1.618710,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,32.689098
2,1,1,0,0,0,1.017208,1.618706,1.017187,1.618710,1.017190,1.618523,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,32.690498
3,1,6,8,1,0,1.007511,1.734777,1.004933,2.050487,1.359838,2.071779,2.549623,2.280430,3.173246,1.209220,2.960154,2.047394,2.302437,1.109295,0.000000,0.000000,0.000000,0.000000,55.525200
4,1,6,1,8,0,1.004933,1.734777,1.007511,2.071779,1.359838,2.050487,2.302437,2.047394,2.960154,1.109295,3.173246,2.280430,2.549623,1.209220,0.000000,0.000000,0.000000,0.000000,54.735901
5,1,6,6,1,8,1.004771,1.727509,1.006952,2.084700,1.369356,2.043309,2.596395,2.434870,3.357022,1.522602,2.472166,2.627629,3.622846,2.184789,3.164912,2.263603,2.493768,1.213961,54.063999
6,1,6,8,6,1,1.006952,1.727509,1.004771,2.043309,1.369356,2.084700,2.493768,2.263603,3.164912,1.213961,3.357022,2.434870,2.596395,1.522602,3.622846,2.627629,2.472166,2.184789,56.186001
7,1,6,1,1,6,1.014969,1.631013,1.015277,2.035264,1.463593,2.046443,2.359597,2.088941,2.937592,1.096093,2.474020,2.178272,2.462809,1.102846,3.285424,2.412301,2.674642,1.467535,37.719002
8,1,6,1,6,1,1.015277,1.631013,1.014969,2.046443,1.463593,2.035264,2.462809,2.178272,2.474020,1.102846,2.674642,2.412301,3.285424,1.467535,2.937592,2.088941,2.359597,1.096093,38.349499
9,6,6,1,1,6,1.004448,2.115390,1.372028,2.115394,1.372027,2.245993,2.506449,2.137405,1.078110,3.260867,2.506459,2.137407,3.260868,1.078109,3.185715,2.218043,1.375401,2.255881,59.300999


In [49]:
X_data=df.drop(['scalar_coupling_constant'],axis=1)
y_data=df.scalar_coupling_constant
X_train,X_val,y_train,y_val=train_test_splita(X_data,y_data,test=)

SyntaxError: invalid syntax (<ipython-input-49-7d45b0f559b7>, line 3)