In [1]:
import numpy as np
import pandas as pd
import fastai
from tqdm import tqdm_notebook as tqdm
from fastai.tabular import *
import pickle

from multiprocessing import Pool
from sklearn.preprocessing import LabelEncoder, LabelBinarizer, StandardScaler
# Monkey-patch a convenience helper onto the numpy module:
# np.range(x) returns the tuple (x.min(), x.max()).
np.range = (lambda x:(x.min(), x.max()))

In [2]:
# Directory the input CSV files are read from, and the directory for outputs.
DATA_PATH = '../input'
SUBMISSIONS_PATH = './'
# Element symbol -> atomic number; used to recode atom names as integers.
ATOMIC_NUMBERS = dict(H=1, C=6, N=7, O=8, F=9)

In [3]:
# Notebook display settings: do not truncate cell contents, and show up to
# 120 rows / 120 columns of a frame.
try:
    # None is the "no limit" value on current pandas; the old -1 sentinel is
    # deprecated since pandas 1.0 and rejected by newer releases.
    pd.set_option('display.max_colwidth', None)
except ValueError:
    # Very old pandas only accepts integers here; -1 is its "no limit" value.
    pd.set_option('display.max_colwidth', -1)
pd.set_option('display.max_rows', 120)
pd.set_option('display.max_columns', 120)

In [4]:
# Compact dtypes for the coupling table. The key order doubles as the column
# order used for the selection below.
train_dtypes = dict(
    molecule_name='category',
    atom_index_0='int8',
    atom_index_1='int8',
    type='category',
    scalar_coupling_constant='float32',
)
train_csv = pd.read_csv(f'{DATA_PATH}/train.csv', index_col='id', dtype=train_dtypes)
# Keep exactly the typed columns, in declaration order.
train_csv = train_csv[list(train_dtypes)]
train_csv.head(10)

  mask |= (ar1 == a)


Unnamed: 0_level_0,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,dsgdb9nsd_000001,1,0,1JHC,84.807602
1,dsgdb9nsd_000001,1,2,2JHH,-11.257
2,dsgdb9nsd_000001,1,3,2JHH,-11.2548
3,dsgdb9nsd_000001,1,4,2JHH,-11.2543
4,dsgdb9nsd_000001,2,0,1JHC,84.807404
5,dsgdb9nsd_000001,2,3,2JHH,-11.2541
6,dsgdb9nsd_000001,2,4,2JHH,-11.2548
7,dsgdb9nsd_000001,3,0,1JHC,84.809303
8,dsgdb9nsd_000001,3,4,2JHH,-11.2543
9,dsgdb9nsd_000001,4,0,1JHC,84.809502


In [5]:
# The test table is read with the same dtype mapping as the training table and
# restricted to the four columns selected here (no target column).
test_csv = pd.read_csv(
    f'{DATA_PATH}/test.csv',
    index_col='id',
    dtype=train_dtypes,
)[['molecule_name', 'atom_index_0', 'atom_index_1', 'type']]
test_csv.head(10)

Unnamed: 0_level_0,molecule_name,atom_index_0,atom_index_1,type
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
4658147,dsgdb9nsd_000004,2,0,2JHC
4658148,dsgdb9nsd_000004,2,1,1JHC
4658149,dsgdb9nsd_000004,2,3,3JHH
4658150,dsgdb9nsd_000004,3,0,1JHC
4658151,dsgdb9nsd_000004,3,1,2JHC
4658152,dsgdb9nsd_000015,3,0,1JHC
4658153,dsgdb9nsd_000015,3,2,3JHC
4658154,dsgdb9nsd_000015,3,4,2JHH
4658155,dsgdb9nsd_000015,3,5,2JHH
4658156,dsgdb9nsd_000015,4,0,1JHC


In [6]:
# Compact dtypes for the per-atom structure table (one row per atom).
structures_dtypes = {
    'molecule_name': 'category',
    'atom_index': 'int8',
    'atom': 'category',
    'x': 'float32',
    'y': 'float32',
    'z': 'float32'
}
structures_csv = pd.read_csv(f'{DATA_PATH}/structures.csv', dtype=structures_dtypes)
structures_csv = structures_csv[['molecule_name', 'atom_index', 'atom', 'x', 'y', 'z']]
# Recode element symbols as atomic numbers. Series.map is used instead of
# Series.replace because replacing the categories of a categorical column via
# replace() is deprecated in recent pandas; map() behaves the same on old and
# new versions. A symbol missing from ATOMIC_NUMBERS becomes NaN and makes the
# int8 cast fail loudly.
structures_csv['atom'] = structures_csv['atom'].map(ATOMIC_NUMBERS).astype('int8')
structures_csv.head(10)

Unnamed: 0,molecule_name,atom_index,atom,x,y,z
0,dsgdb9nsd_000001,0,6,-0.012698,1.085804,0.008001
1,dsgdb9nsd_000001,1,1,0.00215,-0.006031,0.001976
2,dsgdb9nsd_000001,2,1,1.011731,1.463751,0.000277
3,dsgdb9nsd_000001,3,1,-0.540815,1.447527,-0.876644
4,dsgdb9nsd_000001,4,1,-0.523814,1.437933,0.906397
5,dsgdb9nsd_000002,0,7,-0.040426,1.024108,0.062564
6,dsgdb9nsd_000002,1,1,0.017257,0.012545,-0.027377
7,dsgdb9nsd_000002,2,1,0.915789,1.358745,-0.028758
8,dsgdb9nsd_000002,3,1,-0.520278,1.343532,-0.775543
9,dsgdb9nsd_000003,0,8,-0.03436,0.97754,0.007602


In [7]:
def build_type_dataframes(base, structures, coupling_type):
    """Restrict the coupling table and the structure table to one coupling type.

    Args:
        base: coupling table with 'type' and 'molecule_name' columns; its index
            must turn into an 'id' column on reset_index() (i.e. be named 'id').
        structures: per-atom table with a 'molecule_name' column.
        coupling_type: value of the 'type' column to keep.

    Returns:
        (pairs, atoms): the rows of ``base`` with the requested type ('type'
        column removed, 'id' as an int32 column), and the rows of
        ``structures`` whose molecule appears among those pairs.
    """
    of_type = base['type'] == coupling_type
    pairs = base.loc[of_type].drop(columns='type').copy().reset_index()
    pairs['id'] = pairs['id'].astype('int32')
    wanted = structures['molecule_name'].isin(pairs['molecule_name'])
    return pairs, structures[wanted]


def add_coordinates(base, structures, index):
    """Attach the element and coordinates of one atom of each pair.

    Inner-joins ``base`` with ``structures`` on the molecule name and on
    ``atom_index_{index}`` == ``atom_index``, drops the redundant 'atom_index'
    column and suffixes 'atom', 'x', 'y', 'z' with ``_{index}``.

    Returns:
        A new DataFrame; ``base`` is not modified.
    """
    joined = base.merge(
        structures,
        how='inner',
        left_on=['molecule_name', f'atom_index_{index}'],
        right_on=['molecule_name', 'atom_index'],
    )
    joined = joined.drop(columns=['atom_index'])
    suffixed = {name: f'{name}_{index}' for name in ('atom', 'x', 'y', 'z')}
    return joined.rename(columns=suffixed)

def add_atoms(base, atoms):
    """Inner-join per-pair neighbour features onto the pair table.

    Rows are matched on ('molecule_name', 'atom_index_0', 'atom_index_1');
    pairs without a match in ``atoms`` are dropped.
    """
    pair_key = ['molecule_name', 'atom_index_0', 'atom_index_1']
    return base.merge(atoms, how='inner', on=pair_key)


def merge_all_atoms(base, structures):
    """Pair every coupling row with each other atom of the same molecule.

    Left-joins ``base`` with ``structures`` on 'molecule_name', then removes
    the rows where the joined atom is one of the two coupled atoms
    (atom_index equal to atom_index_0 or atom_index_1).
    """
    expanded = base.merge(structures, how='left', on='molecule_name')
    is_first = expanded.atom_index_0 == expanded.atom_index
    is_second = expanded.atom_index_1 == expanded.atom_index
    return expanded[~(is_first | is_second)]

def add_center(df):
    """Add the midpoint of atoms 0 and 1 as columns x_c, y_c, z_c.

    Modifies ``df`` in place and returns None. The 0.5 factor is a float32
    scalar, as in the rest of the coordinate arithmetic.
    """
    half = np.float32(0.5)
    for axis in ('x', 'y', 'z'):
        df[f'{axis}_c'] = (df[f'{axis}_1'] + df[f'{axis}_0']) * half

def add_distance_to_center(df):
    """Add column 'd_c': Euclidean distance from (x, y, z) to (x_c, y_c, z_c).

    Modifies ``df`` in place and returns None. The exponents are float32
    scalars.
    """
    two, half = np.float32(2), np.float32(0.5)
    dx = df['x_c'] - df['x']
    dy = df['y_c'] - df['y']
    dz = df['z_c'] - df['z']
    df['d_c'] = (dx**two + dy**two + dz**two)**half

def add_distance_between(df, suffix1, suffix2):
    """Add column d_{suffix1}_{suffix2}: distance between two suffixed points.

    The points are read from the x_/y_/z_ columns carrying ``suffix1`` and
    ``suffix2``. Modifies ``df`` in place and returns None.
    """
    two, half = np.float32(2), np.float32(0.5)
    dx = df[f'x_{suffix1}'] - df[f'x_{suffix2}']
    dy = df[f'y_{suffix1}'] - df[f'y_{suffix2}']
    dz = df[f'z_{suffix1}'] - df[f'z_{suffix2}']
    df[f'd_{suffix1}_{suffix2}'] = (dx**two + dy**two + dz**two)**half
    
    
def add_distances(df):
    """Add pairwise distance columns d_i_j between the numbered atoms of ``df``.

    The number of atoms is one more than the largest integer suffix among the
    'x_<k>' columns. For every atom i >= 1 a distance is added to each atom j
    with j < min(4, i), i.e. to at most the first four atoms. Modifies ``df``
    in place via add_distance_between.
    """
    x_suffixes = [int(name.split('_')[1]) for name in df.columns if name.startswith('x_')]
    atom_count = max(x_suffixes) + 1
    index_pairs = [
        (later, earlier)
        for later in range(1, atom_count)
        for earlier in range(min(4, later))
    ]
    for later, earlier in index_pairs:
        add_distance_between(df, later, earlier)
            
def add_n_atoms(base, structures):
    """Add an 'n_atoms' column: number of ``structures`` rows per molecule.

    Counts the occurrences of each 'molecule_name' in ``structures`` and joins
    the counts onto ``base`` by molecule name (default inner join).
    """
    atom_counts = structures['molecule_name'].value_counts().to_frame(name='n_atoms')
    return base.merge(atom_counts, left_on='molecule_name', right_index=True)

def build_couple_dataframe(some_csv, structures_csv, coupling_type, n_atoms=10):
    """Build the per-pair feature frame for one coupling type.

    For every atom pair of ``coupling_type`` the result holds the coordinates
    of the two coupled atoms (suffixes _0 and _1), the nearest other atoms of
    the same molecule (suffixes _2, _3, ...) ranked by distance to the pair's
    midpoint, and the pairwise distances added by ``add_distances``.

    Args:
        some_csv: coupling table (train or test); 'scalar_coupling_constant'
            is optional.
        structures_csv: per-atom structure table.
        coupling_type: value of the 'type' column to process.
        n_atoms: neighbours are numbered from 2 and only those with a number
            below ``n_atoms`` are kept, i.e. at most ``n_atoms - 2`` per pair.

    Returns:
        DataFrame sorted by 'id'. Pairs with no matching neighbour rows are
        dropped by the inner join in ``add_atoms``.
    """
    base, structures = build_type_dataframes(some_csv, structures_csv, coupling_type)
    # Attach element + coordinates of both coupled atoms (atom_0/x_0/... and atom_1/x_1/...).
    base = add_coordinates(base, structures, 0)
    base = add_coordinates(base, structures, 1)
    
    # The elements of the coupled atoms are not kept as features.
    base = base.drop(['atom_0', 'atom_1'], axis=1)
    # `atoms` is a working copy used only to find and rank the neighbouring atoms.
    atoms = base.drop('id', axis=1).copy()
    if 'scalar_coupling_constant' in some_csv:
        atoms = atoms.drop(['scalar_coupling_constant'], axis=1)
        
    # Midpoint of the pair; afterwards the endpoint coordinates are not needed here.
    add_center(atoms)
    atoms = atoms.drop(['x_0', 'y_0', 'z_0', 'x_1', 'y_1', 'z_1'], axis=1)

    # One row per (pair, other atom of the same molecule).
    atoms = merge_all_atoms(atoms, structures)
    
    add_distance_to_center(atoms)
    
    atoms = atoms.drop(['x_c', 'y_c', 'z_c', 'atom_index'], axis=1)
    # Rank each pair's neighbours by ascending distance to the midpoint; numbering
    # starts at 2 because 0 and 1 denote the coupled atoms themselves.
    atoms.sort_values(['molecule_name', 'atom_index_0', 'atom_index_1', 'd_c'], inplace=True)
    atom_groups = atoms.groupby(['molecule_name', 'atom_index_0', 'atom_index_1'])
    atoms['num'] = atom_groups.cumcount() + 2
    atoms = atoms.drop(['d_c'], axis=1)
    atoms = atoms[atoms['num'] < n_atoms]

    # Pivot the neighbour rank into columns and flatten the resulting
    # (column, num) labels to '<column>_<num>' (e.g. atom_2, x_2, ...).
    atoms = atoms.set_index(['molecule_name', 'atom_index_0', 'atom_index_1', 'num']).unstack()
    atoms.columns = [f'{col[0]}_{col[1]}' for col in atoms.columns]
    atoms = atoms.reset_index()
    
    # downcast back to int8
    # Missing neighbours (NaN after unstack) become 0. The 'atom_' prefix also
    # matches atom_index_0 / atom_index_1, which go through the same cast.
    for col in atoms.columns:
        if col.startswith('atom_'):
            atoms[col] = atoms[col].fillna(0).astype('int8')
            
    #atoms['molecule_name'] = atoms['molecule_name'].astype('int32')
    
    # Join the neighbour features back onto the pair table and add the d_i_j distances.
    full = add_atoms(base, atoms)
    add_distances(full)
    
    full.sort_values('id', inplace=True)
    
    return full


def angle(df, u=['x_0', 'y_0', 'z_0'], v=['x_1', 'y_1', 'z_1']):
    """Row-wise angle in radians between the vectors held in columns u and v.

    The column values are used directly as vector components, so with the
    default arguments this is the angle between the two position vectors
    measured from the coordinate origin.

    Returns:
        1-D numpy array with one angle in [0, pi] per row.
    """
    first = df[u].values
    second = df[v].values
    dot = (first * second).sum(-1)
    cosine = dot / np.linalg.norm(first, axis=-1) / np.linalg.norm(second, axis=-1)
    # Rounding can push the cosine slightly outside [-1, 1]; clip before arccos.
    return np.arccos(np.clip(cosine, -1, 1))

In [8]:
# Build the feature frame of every coupling type for both datasets.
# The per-type frames are collected in lists and concatenated once at the end:
# DataFrame.append copied the whole accumulated frame on every iteration and no
# longer exists in pandas >= 2.0.
train_frames, test_frames = [], []
for t in tqdm(['3JHC', '2JHC', '1JHC', '3JHH', '2JHH', '3JHN', '2JHN', '1JHN']):
    for source_csv, frames in ((train_csv, train_frames), (test_csv, test_frames)):
        df = build_couple_dataframe(source_csv, structures_csv, t, n_atoms=7)
        df['type'] = t
        # Drop the neighbour coordinates; the d_i_j distance columns stay.
        df = df.drop(columns=['x_2', 'x_3', 'x_4', 'x_5', 'x_6',
                              'y_2', 'y_3', 'y_4', 'y_5', 'y_6',
                              'z_2', 'z_3', 'z_4', 'z_5', 'z_6'])
        frames.append(df)

train_pre = pd.concat(train_frames)
test_pre = pd.concat(test_frames)

# Remaining missing values are filled with 0.
test_pre = test_pre.fillna(0).set_index('id')
train_pre = train_pre.fillna(0).set_index('id')

# angle() with its defaults uses the x_0/y_0/z_0 and x_1/y_1/z_1 columns.
train_pre['angle'] = angle(train_pre)
test_pre['angle'] = angle(test_pre)

# Restore the original row order (by id) and turn 'id' back into a column.
test_pre = test_pre.sort_index().reset_index()
train_pre = train_pre.sort_index().reset_index()

HBox(children=(IntProgress(value=0, max=8), HTML(value='')))




In [9]:
train_csv.shape, train_pre.shape, test_pre.shape, test_csv.shape

((4658147, 5), (4658147, 36), (2505542, 35), (2505542, 4))

In [10]:
# Order the training frame by 'id' and keep 'id' as a regular column.
train_pre = train_pre.set_index('id').sort_index().reset_index()

In [11]:
train_pre.head()

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,scalar_coupling_constant,x_0,y_0,z_0,x_1,y_1,z_1,atom_2,atom_3,atom_4,atom_5,atom_6,d_1_0,d_2_0,d_2_1,d_3_0,d_3_1,d_3_2,d_4_0,d_4_1,d_4_2,d_4_3,d_5_0,d_5_1,d_5_2,d_5_3,d_6_0,d_6_1,d_6_2,d_6_3,type,angle
0,0,dsgdb9nsd_000001,1,0,84.807602,0.00215,-0.006031,0.001976,-0.012698,1.085804,0.008001,1,1,1,0,0,1.091953,1.78312,1.091952,1.783147,1.091946,1.783157,1.783157,1.091948,1.783148,1.783148,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1JHC,2.694086
1,1,dsgdb9nsd_000001,1,2,-11.257,0.00215,-0.006031,0.001976,1.011731,1.463751,0.000277,6,1,1,0,0,1.78312,1.091953,1.091952,1.783157,1.783148,1.091948,1.783147,1.783157,1.091946,1.783148,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2JHH,2.162629
2,2,dsgdb9nsd_000001,1,3,-11.2548,0.00215,-0.006031,0.001976,-0.540815,1.447527,-0.876644,6,1,1,0,0,1.783147,1.091953,1.091946,1.78312,1.783157,1.091952,1.783157,1.783148,1.091948,1.783148,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2JHH,2.924457
3,3,dsgdb9nsd_000001,1,4,-11.2543,0.00215,-0.006031,0.001976,-0.523814,1.437933,0.906397,6,1,1,0,0,1.783157,1.091953,1.091948,1.78312,1.783148,1.091952,1.783147,1.783148,1.091946,1.783157,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2JHH,2.307507
4,4,dsgdb9nsd_000001,2,0,84.807404,1.011731,1.463751,0.000277,-0.012698,1.085804,0.008001,1,1,1,0,0,1.091952,1.78312,1.091953,1.783148,1.091948,1.783157,1.783157,1.091946,1.783147,1.783148,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1JHC,0.616519


In [12]:
train_csv.head()

Unnamed: 0_level_0,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,dsgdb9nsd_000001,1,0,1JHC,84.807602
1,dsgdb9nsd_000001,1,2,2JHH,-11.257
2,dsgdb9nsd_000001,1,3,2JHH,-11.2548
3,dsgdb9nsd_000001,1,4,2JHH,-11.2543
4,dsgdb9nsd_000001,2,0,1JHC,84.807404


In [13]:
structures_csv[structures_csv.molecule_name=='dsgdb9nsd_000001']

Unnamed: 0,molecule_name,atom_index,atom,x,y,z
0,dsgdb9nsd_000001,0,6,-0.012698,1.085804,0.008001
1,dsgdb9nsd_000001,1,1,0.00215,-0.006031,0.001976
2,dsgdb9nsd_000001,2,1,1.011731,1.463751,0.000277
3,dsgdb9nsd_000001,3,1,-0.540815,1.447527,-0.876644
4,dsgdb9nsd_000001,4,1,-0.523814,1.437933,0.906397


In [14]:
# Split the coupling type label (e.g. '3JHC') into two extra features:
#   type_i - the leading digit minus one, stored as int8
#   type_a - the last character, label-encoded below
for frame in (train_pre, test_pre):
    frame['type_i'] = frame['type'].apply(lambda label: int(label[0]) - 1).astype(np.int8)
    frame['type_a'] = frame['type'].apply(lambda label: label[-1])

# Both encoders are fitted on the training frame only and reused for the test frame.
type_a_encoder = LabelEncoder().fit(train_pre['type_a'])
type_encoder = LabelEncoder().fit(train_pre['type'])

for frame in (train_pre, test_pre):
    frame['type_a'] = type_a_encoder.transform(frame['type_a']).astype(np.int8)
    frame['type'] = type_encoder.transform(frame['type']).astype(np.int8)
    # Shift the atom indices by one so the smallest index is 1 instead of 0.
    frame['atom_index_0'] += 1
    frame['atom_index_1'] += 1

In [15]:
# Encode the neighbour atomic numbers as contiguous integer codes. The value 0
# is included because the neighbour columns are filled with 0 where missing.
atom_encoder = LabelEncoder()
atom_encoder.fit([0, 1, 6, 7, 8, 9])
for atom_name in [f'atom_{k}' for k in range(2, 7)]:
    train_pre[atom_name] = atom_encoder.transform(train_pre[atom_name]).astype(np.int8)
    test_pre[atom_name] = atom_encoder.transform(test_pre[atom_name]).astype(np.int8)

In [16]:
len(train_pre.columns)

38

In [17]:
# Final column order: identifiers, categorical features, continuous features,
# and the target last so that the test frame can use column_names[:-1].
column_names = (
    ['id', 'molecule_name', 'atom_index_0', 'atom_index_1', 'type_i', 'type_a', 'type']
    + [f'atom_{k}' for k in range(2, 7)]
    + ['angle']
    + [f'd_{i}_{j}' for i in range(1, 7) for j in range(min(4, i))]
    + ['scalar_coupling_constant']
)
train_pre = train_pre[column_names]

test_pre = test_pre[column_names[:-1]]

In [18]:
len(column_names)

32

In [19]:
train_pre.columns

Index(['id', 'molecule_name', 'atom_index_0', 'atom_index_1', 'type_i',
       'type_a', 'type', 'atom_2', 'atom_3', 'atom_4', 'atom_5', 'atom_6',
       'angle', 'd_1_0', 'd_2_0', 'd_2_1', 'd_3_0', 'd_3_1', 'd_3_2', 'd_4_0',
       'd_4_1', 'd_4_2', 'd_4_3', 'd_5_0', 'd_5_1', 'd_5_2', 'd_5_3', 'd_6_0',
       'd_6_1', 'd_6_2', 'd_6_3', 'scalar_coupling_constant'],
      dtype='object')

In [20]:
# Continuous feature columns: the angle plus every distance d_i_j with
# 1 <= i <= 6 and j < min(4, i).
conts = ['angle'] + [f'd_{i}_{j}' for i in range(1, 7) for j in range(min(4, i))]

### Standardize the continuous features (using the training-set mean and standard deviation)

In [21]:
# Z-score each continuous column. The mean and standard deviation come from
# the training frame only and are applied unchanged to the test frame.
for cont in tqdm(conts):
    m = train_pre[cont].mean()
    s = train_pre[cont].std()
    train_pre[cont] = train_pre[cont].sub(m).div(s).astype(np.float32)
    test_pre[cont] = test_pre[cont].sub(m).div(s).astype(np.float32)

HBox(children=(IntProgress(value=0, max=19), HTML(value='')))




In [22]:
train_pre[conts].describe()

Unnamed: 0,angle,d_1_0,d_2_0,d_2_1,d_3_0,d_3_1,d_3_2,d_4_0,d_4_1,d_4_2,d_4_3,d_5_0,d_5_1,d_5_2,d_5_3,d_6_0,d_6_1,d_6_2,d_6_3
count,4658147.0,4658147.0,4658147.0,4658147.0,4658147.0,4658147.0,4658147.0,4658147.0,4658147.0,4658147.0,4658147.0,4658147.0,4658147.0,4658147.0,4658147.0,4658147.0,4658147.0,4658147.0,4658147.0
mean,-0.001749185,0.01857403,0.06583207,0.06287307,0.02153001,0.04157142,0.04258126,-0.007948117,0.04769866,0.06383684,0.03923139,-0.04306484,0.02971388,0.04452496,0.02774148,-0.01965844,0.0266296,0.02849898,0.01095967
std,0.9999884,1.001702,1.008041,1.00469,1.003871,0.9909114,0.9970512,0.9986135,0.9982445,0.9997711,1.000986,0.9983974,1.000056,1.001491,1.001318,1.000149,0.9979293,0.9987681,0.9962853
min,-1.557116,-1.834749,-1.169063,-1.44981,-3.225297,-3.119366,-4.246944,-5.153661,-3.58516,-3.425627,-3.487261,-5.278745,-3.769542,-3.306074,-2.929187,-5.154486,-3.472672,-3.678914,-3.024405
25%,-0.7982832,-0.5439885,-0.9041336,-1.041214,-1.269445,-0.791401,-0.4502157,-0.4446263,-0.7551433,-0.6484908,-0.6295125,-0.7297942,-0.7272984,-0.8990283,-0.9532807,-0.6380075,-1.045526,-0.314344,-0.7962322
50%,-0.2732248,-0.04617299,0.3949407,0.1102241,0.3739999,-0.0528206,-0.2770656,-0.1135471,0.3910827,0.5105792,0.02973343,-0.0825564,0.08095925,0.1326503,0.05696625,0.005870787,0.08394822,0.02799833,0.07706369
75%,0.7521172,0.8172447,1.176564,0.225946,0.7717648,0.9895137,1.122608,0.5705792,0.6995134,0.7699772,0.6433953,0.615481,0.6844449,0.6674007,0.81003,0.5933324,0.7047071,0.56778,0.7049202
max,3.255552,2.151099,2.033639,3.493843,3.459681,2.343346,3.995469,5.948771,3.634544,4.811091,4.542479,7.71615,5.371619,5.371699,4.012976,8.575139,6.29173,6.623074,4.989589


In [23]:
test_pre[conts].describe()

Unnamed: 0,angle,d_1_0,d_2_0,d_2_1,d_3_0,d_3_1,d_3_2,d_4_0,d_4_1,d_4_2,d_4_3,d_5_0,d_5_1,d_5_2,d_5_3,d_6_0,d_6_1,d_6_2,d_6_3
count,2505542.0,2505542.0,2505542.0,2505542.0,2505542.0,2505542.0,2505542.0,2505542.0,2505542.0,2505542.0,2505542.0,2505542.0,2505542.0,2505542.0,2505542.0,2505542.0,2505542.0,2505542.0,2505542.0
mean,0.001870482,0.01837534,0.0646394,0.06424154,0.02237896,0.04034468,0.04388591,-0.009822115,0.04936559,0.064355,0.03918195,-0.04285982,0.0295562,0.04384346,0.02884435,-0.01869155,0.02716279,0.02872533,0.01191294
std,1.002163,1.004041,1.001407,1.007008,1.00225,0.9944122,1.005277,1.000469,1.000214,1.000349,1.004237,1.000236,1.000816,1.001699,1.000773,1.001345,0.9992558,1.001457,0.9994797
min,-1.557116,-1.834772,-1.168822,-1.449458,-1.51559,-1.447528,-1.64696,-5.153661,-3.58516,-3.425627,-3.487261,-5.278745,-3.769542,-3.306074,-2.929187,-5.154486,-3.472672,-3.678914,-3.024405
25%,-0.7967794,-0.5435347,-0.90413,-1.041277,-1.269415,-0.7957028,-0.4511842,-0.4486038,-0.7542096,-0.6503331,-0.6296535,-0.7297956,-0.7270318,-0.8988087,-0.9528455,-0.6371149,-1.044045,-0.3144204,-0.7942262
50%,-0.2706272,-0.04731366,0.3936405,0.1102375,0.3778717,-0.05495989,-0.2765411,-0.1141,0.3917809,0.5118628,0.03004983,-0.08274918,0.08060604,0.1327267,0.06035705,0.006195101,0.08428513,0.02931276,0.07783316
75%,0.758549,0.8170904,1.176181,0.2263631,0.7714604,0.9878154,1.126413,0.5709616,0.7033592,0.7705477,0.6442838,0.6149552,0.6829773,0.6666362,0.8132419,0.5944929,0.7034352,0.567888,0.7041502
max,3.253247,2.790596,2.119213,3.475595,3.461015,2.965536,3.37132,5.916121,3.69897,4.821081,3.667066,7.709538,5.418262,5.224132,4.028304,8.575206,6.291515,6.622846,4.989412


In [24]:
test_pre.isna().sum().sum(), train_pre.isna().sum().sum()

(0, 0)

In [25]:
train_pre.head()

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type_i,type_a,type,atom_2,atom_3,atom_4,atom_5,atom_6,angle,d_1_0,d_2_0,d_2_1,d_3_0,d_3_1,d_3_2,d_4_0,d_4_1,d_4_2,d_4_3,d_5_0,d_5_1,d_5_2,d_5_3,d_6_0,d_6_1,d_6_2,d_6_3,scalar_coupling_constant
0,0,dsgdb9nsd_000001,2,1,0,0,0,1,1,1,0,0,2.570007,-1.712379,0.454218,-1.066589,-0.045399,-1.218631,0.376567,-1.099632,-1.553749,-0.047694,-0.615493,-5.278745,-3.769542,-3.306074,-2.929187,-5.154486,-3.472672,-3.678914,-3.024405,84.807602
1,1,dsgdb9nsd_000001,2,3,1,1,3,2,1,1,0,0,1.755858,-0.769607,-0.906534,-1.066589,-0.045383,-0.015466,-1.415656,-1.099652,-0.267853,-1.357082,-0.615493,-5.278745,-3.769542,-3.306074,-2.929187,-5.154486,-3.472672,-3.678914,-3.024405,-11.257
2,2,dsgdb9nsd_000001,2,4,1,1,3,2,1,1,0,0,2.922918,-0.769569,-0.906534,-1.066604,-0.045449,-0.01545,-1.415646,-1.099632,-0.267871,-1.35708,-0.615492,-5.278745,-3.769542,-3.306074,-2.929187,-5.154486,-3.472672,-3.678914,-3.024405,-11.2548
3,3,dsgdb9nsd_000001,2,5,1,1,3,2,1,1,0,0,1.9778,-0.769557,-0.906534,-1.066601,-0.045449,-0.015466,-1.415646,-1.099652,-0.267871,-1.357082,-0.615478,-5.278745,-3.769542,-3.306074,-2.929187,-5.154486,-3.472672,-3.678914,-3.024405,-11.2543
4,4,dsgdb9nsd_000001,3,1,0,0,0,1,1,1,0,0,-0.612658,-1.712381,0.454218,-1.066586,-0.045398,-1.218629,0.376565,-1.09963,-1.553752,-0.047696,-0.615493,-5.278745,-3.769542,-3.306074,-2.929187,-5.154486,-3.472672,-3.678914,-3.024405,84.807404


In [26]:
test_pre.head()

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type_i,type_a,type,atom_2,atom_3,atom_4,atom_5,atom_6,angle,d_1_0,d_2_0,d_2_1,d_3_0,d_3_1,d_3_2,d_4_0,d_4_1,d_4_2,d_4_3,d_5_0,d_5_1,d_5_2,d_5_3,d_6_0,d_6_1,d_6_2,d_6_3
0,4658147,dsgdb9nsd_000004,3,1,1,0,2,2,1,0,0,0,0.846656,-0.117521,-0.96531,-0.758715,2.701123,-1.270586,1.616017,-5.153661,-3.58516,-3.425627,-3.487261,-5.278745,-3.769542,-3.306074,-2.929187,-5.154486,-3.472672,-3.678914,-3.024405
1,4658148,dsgdb9nsd_000004,3,2,0,0,0,2,1,0,0,0,-0.808065,-1.7531,1.395408,-0.758715,2.701123,0.816633,-1.49305,-5.153661,-3.58516,-3.425627,-3.487261,-5.278745,-3.769542,-3.306074,-2.929187,-5.154486,-3.472672,-3.678914,-3.024405
2,4658149,dsgdb9nsd_000004,3,4,2,1,6,2,2,0,0,0,1.595707,1.331213,1.395408,-1.152383,-1.331249,0.816633,-1.137877,-5.153661,-3.58516,-3.425627,-3.487261,-5.278745,-3.769542,-3.306074,-2.929187,-5.154486,-3.472672,-3.678914,-3.024405
3,4658150,dsgdb9nsd_000004,4,1,0,0,0,2,1,0,0,0,-0.808065,-1.7531,1.395408,-0.758715,2.701123,0.816633,-1.49305,-5.153661,-3.58516,-3.425627,-3.487261,-5.278745,-3.769542,-3.306074,-2.929187,-5.154486,-3.472672,-3.678914,-3.024405
4,4658151,dsgdb9nsd_000004,4,2,1,0,2,2,1,0,0,0,0.846656,-0.117521,-0.96531,-0.758715,2.701123,-1.270586,1.616017,-5.153661,-3.58516,-3.425627,-3.487261,-5.278745,-3.769542,-3.306074,-2.929187,-5.154486,-3.472672,-3.678914,-3.024405


In [27]:
# Persist both processed frames together with the column order for later reuse.
with open('train_test_pre.pkl', mode='wb') as f:
    pickle.dump([train_pre, test_pre, column_names], file=f)

In [28]:
train_pre.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4658147 entries, 0 to 4658146
Data columns (total 32 columns):
id                          int64
molecule_name               object
atom_index_0                int8
atom_index_1                int8
type_i                      int8
type_a                      int8
type                        int8
atom_2                      int8
atom_3                      int8
atom_4                      int8
atom_5                      int8
atom_6                      int8
angle                       float32
d_1_0                       float32
d_2_0                       float32
d_2_1                       float32
d_3_0                       float32
d_3_1                       float32
d_3_2                       float32
d_4_0                       float32
d_4_1                       float32
d_4_2                       float32
d_4_3                       float32
d_5_0                       float32
d_5_1                       float32
d_5_2               

In [29]:
train_pre.memory_usage(deep=True).sum()/1024/1024

759.6428079605103