In [1]:
import numpy as np
import pandas as pd
import fastai
from tqdm import tqdm_notebook as tqdm
from fastai.tabular import *

def _min_max(values):
    """Return ``(values.min(), values.max())`` for any object exposing both methods."""
    lowest = values.min()
    highest = values.max()
    return lowest, highest


# NOTE: `np.range` is not part of numpy's own API; this attaches the helper
# above to the numpy namespace so later cells can call `np.range(...)`.
np.range = _min_max

In [2]:
# Load the three CSV inputs from the current working directory, in this order.
train, test, structures = (
    pd.read_csv(csv_path)
    for csv_path in ("train.csv", "test.csv", "structures.csv")
)

In [3]:
# Per-column count of missing values in each table, displayed as a 2-tuple.
tuple(table.isna().sum() for table in (train, structures))

(id                          0
 molecule_name               0
 atom_index_0                0
 atom_index_1                0
 type                        0
 scalar_coupling_constant    0
 dtype: int64, molecule_name    0
 atom_index       0
 atom             0
 x                0
 y                0
 z                0
 dtype: int64)

## No missing values in `train` or `structures`

In [4]:
# Preview the first five rows of the training table.
train.head()

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant
0,0,dsgdb9nsd_000001,1,0,1JHC,84.8076
1,1,dsgdb9nsd_000001,1,2,2JHH,-11.257
2,2,dsgdb9nsd_000001,1,3,2JHH,-11.2548
3,3,dsgdb9nsd_000001,1,4,2JHH,-11.2543
4,4,dsgdb9nsd_000001,2,0,1JHC,84.8074


In [5]:
# Preview the first five rows of the per-atom table (element and x/y/z coordinates).
structures.head()

Unnamed: 0,molecule_name,atom_index,atom,x,y,z
0,dsgdb9nsd_000001,0,C,-0.012698,1.085804,0.008001
1,dsgdb9nsd_000001,1,H,0.00215,-0.006031,0.001976
2,dsgdb9nsd_000001,2,H,1.011731,1.463751,0.000277
3,dsgdb9nsd_000001,3,H,-0.540815,1.447527,-0.876644
4,dsgdb9nsd_000001,4,H,-0.523814,1.437933,0.906397


In [6]:
# Summary statistics for the numeric columns of `structures`.
structures.describe()

Unnamed: 0,atom_index,x,y,z
count,2358657.0,2358657.0,2358657.0,2358657.0
mean,8.757349,0.09489178,-0.3337381,0.06241504
std,5.592487,1.655271,1.989152,1.44587
min,0.0,-9.234889,-9.933938,-9.134765
25%,4.0,-0.8746097,-1.826156,-0.8424896
50%,9.0,0.05183615,-0.4035932,0.01093207
75%,13.0,1.116101,1.37366,0.9394357
max,28.0,9.38224,10.18196,7.894733


In [7]:
# Summary statistics for the numeric columns of `train`.
train.describe()

Unnamed: 0,id,atom_index_0,atom_index_1,scalar_coupling_constant
count,4658147.0,4658147.0,4658147.0,4658147.0
mean,2329073.0,13.35689,5.883966,15.92165
std,1344691.0,3.267712,4.993943,34.94198
min,0.0,0.0,0.0,-36.2186
25%,1164536.0,11.0,2.0,-0.254978
50%,2329073.0,13.0,5.0,2.28113
75%,3493610.0,16.0,8.0,7.390655
max,4658146.0,28.0,28.0,204.88


In [8]:
# Molecules that appear in both splits; an empty set means train and test share none.
set(train.molecule_name) & set(test.molecule_name)

set()

In [21]:
# Attach the element and coordinates of both atoms of every coupling pair.
# Each merge looks up one atom of the pair in `structures`; the suffix
# (`_0` / `_1`) says which atom the added columns belong to.
# `validate='many_to_one'` makes pandas raise MergeError if a
# (molecule_name, atom_index) key is duplicated in `structures`, which would
# otherwise silently multiply rows.
tmp = train.merge(
    structures.rename(columns=lambda x: x + '_0'),
    left_on=['molecule_name', 'atom_index_0'],
    right_on=['molecule_name_0', 'atom_index_0'],
    validate='many_to_one',
)
joined = tmp.merge(
    structures.rename(columns=lambda x: x + '_1'),
    left_on=['molecule_name', 'atom_index_1'],
    right_on=['molecule_name_1', 'atom_index_1'],
    validate='many_to_one',
)

# The suffixed molecule-name columns duplicate `molecule_name`; drop them.
joined = joined.drop(columns=['molecule_name_0', 'molecule_name_1'])

# The merges are inner joins, so a pair whose atom is missing from
# `structures` would vanish without any error. Fail loudly instead.
assert len(joined) == len(train), (
    f"merge changed the row count: {len(train)} -> {len(joined)}"
)

In [22]:
# Percentage of rows per element for the first atom of each pair.
joined['atom_0'].value_counts().div(len(joined)).mul(100)

H    100.0
Name: atom_0, dtype: float64

In [23]:
# Percentage of rows per element for the second atom of each pair.
joined['atom_1'].value_counts().div(len(joined)).mul(100)

C    72.141755
H    20.794685
N     7.063560
Name: atom_1, dtype: float64

In [10]:
# (rows, columns) of the merged frame.
joined.shape

(4658147, 14)

In [11]:
# Smallest and largest `id` in the merged frame, via the custom `np.range`
# helper attached to numpy in the first cell.
np.range(joined.id)

(0, 4658146)

In [12]:
# Preview the first five rows of the merged frame.
joined.head()

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant,atom_0,x_0,y_0,z_0,atom_1,x_1,y_1,z_1
0,0,dsgdb9nsd_000001,1,0,1JHC,84.8076,H,0.00215,-0.006031,0.001976,C,-0.012698,1.085804,0.008001
1,4,dsgdb9nsd_000001,2,0,1JHC,84.8074,H,1.011731,1.463751,0.000277,C,-0.012698,1.085804,0.008001
2,7,dsgdb9nsd_000001,3,0,1JHC,84.8093,H,-0.540815,1.447527,-0.876644,C,-0.012698,1.085804,0.008001
3,9,dsgdb9nsd_000001,4,0,1JHC,84.8095,H,-0.523814,1.437933,0.906397,C,-0.012698,1.085804,0.008001
4,1,dsgdb9nsd_000001,1,2,2JHH,-11.257,H,0.00215,-0.006031,0.001976,H,1.011731,1.463751,0.000277


In [13]:
# Group the merged pairs by the element of each atom and the coupling type.
GPD = joined.groupby(['atom_0', 'atom_1', 'type'])

# Fraction of all training rows that falls in each group.
# `size()` counts rows per group directly, instead of running a Python lambda
# over every column and then discarding everything except `id`; the result is
# the same single-column frame named `id`.
(GPD.size() / len(train)).to_frame('id')

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,id
atom_0,atom_1,type,Unnamed: 3_level_1
H,C,1JHC,0.152296
H,C,2JHC,0.244877
H,C,3JHC,0.324245
H,H,2JHH,0.081156
H,H,3JHH,0.126791
H,N,1JHN,0.009309
H,N,2JHN,0.025601
H,N,3JHN,0.035726


In [14]:
# Per-group (min, max) of every non-grouping column, via the custom `np.range`
# helper attached to numpy in the first cell.
GPD.agg(lambda x : np.range(x))  # append [['scalar_coupling_constant']] to show only the target column

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,id,molecule_name,atom_index_0,atom_index_1,scalar_coupling_constant,x_0,y_0,z_0,x_1,y_1,z_1
atom_0,atom_1,type,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
H,C,1JHC,"(0, 4658146)","(dsgdb9nsd_000001, dsgdb9nsd_133884)","(1, 28)","(0, 18)","(66.6008, 204.88)","(-9.234888942000001, 9.382240438)","(-9.494159758, 9.714469231)","(-9.134764787, 7.637577918)","(-8.677875692, 8.420892703)","(-8.432505696, 8.653664663999999)","(-8.439682331, 6.7962990670000005)"
H,C,2JHC,"(20, 4658145)","(dsgdb9nsd_000007, dsgdb9nsd_133884)","(0, 28)","(0, 17)","(-36.2186, 42.8192)","(-9.234888942000001, 9.382240438)","(-9.494159758, 9.714469231)","(-9.134764787, 7.637577918)","(-7.317725985, 7.468888258)","(-7.824675685, 7.5296495310000005)","(-8.439682331, 6.126566198)"
H,C,3JHC,"(58, 4658144)","(dsgdb9nsd_000009, dsgdb9nsd_133884)","(1, 28)","(0, 18)","(-18.5821, 76.0437)","(-9.234888942000001, 9.382240438)","(-9.494159758, 9.714469231)","(-9.134764787, 7.637577918)","(-7.347374907000001, 8.420892703)","(-8.432505696, 8.653664663999999)","(-8.439682331, 6.7962990670000005)"
H,H,2JHH,"(1, 4658120)","(dsgdb9nsd_000001, dsgdb9nsd_133884)","(1, 27)","(2, 28)","(-35.1761, 11.8542)","(-9.218969711, 8.220769838999999)","(-8.917475925, 8.118444459)","(-9.134764787, 7.1846107329999995)","(-9.234888942000001, 8.230467342999999)","(-8.186595779, 8.093881642000001)","(-8.789131194, 6.965530277999999)"
H,H,3JHH,"(23, 4658138)","(dsgdb9nsd_000007, dsgdb9nsd_133884)","(0, 25)","(3, 28)","(-3.02046, 17.4841)","(-7.933076547000001, 7.04495728)","(-8.112642726, 7.960800674)","(-9.134764787, 6.554751062)","(-7.942035177999999, 8.230467342999999)","(-9.25440519, 8.118444459)","(-7.986579357, 7.46022491)"
H,N,1JHN,"(10, 4656569)","(dsgdb9nsd_000002, dsgdb9nsd_133808)","(0, 23)","(0, 11)","(24.3222, 80.4187)","(-7.942035177999999, 6.608726142999999)","(-8.038531287, 6.92555625)","(-6.73856946, 6.775142917999999)","(-7.019292425, 6.793022522)","(-7.021651827, 6.837290267)","(-5.92542783, 6.29136018)"
H,N,2JHN,"(18, 4658130)","(dsgdb9nsd_000005, dsgdb9nsd_133884)","(2, 25)","(0, 11)","(-2.62085, 17.7436)","(-7.933076547000001, 8.159108193)","(-7.350395892000001, 8.050028205)","(-6.750298797, 6.7791448260000005)","(-7.019292425, 6.793022522)","(-7.021651827, 6.837290267)","(-5.92542783, 6.29136018)"
H,N,3JHN,"(73, 4658139)","(dsgdb9nsd_000010, dsgdb9nsd_133884)","(1, 25)","(0, 11)","(-3.1724099999999997, 10.9712)","(-7.735907448, 8.072174217999999)","(-7.3163323039999995, 7.6339779839999995)","(-6.716641838999999, 6.680448625)","(-7.311588747999999, 8.359691326)","(-8.382658702999999, 8.59417429)","(-6.4086470239999995, 6.74237697)"


In [15]:
# Repeat the atom lookup and group shares for the test pairs.
# NOTE: this rebinds `tmp`, `joined` and `GPD`; after this cell they refer to
# TEST data, not the training frames built earlier.
# `validate='many_to_one'` makes pandas raise MergeError if a
# (molecule_name, atom_index) key is duplicated in `structures`.
tmp = test.merge(
    structures.rename(columns=lambda x: x + '_0'),
    left_on=['molecule_name', 'atom_index_0'],
    right_on=['molecule_name_0', 'atom_index_0'],
    validate='many_to_one',
)
joined = tmp.merge(
    structures.rename(columns=lambda x: x + '_1'),
    left_on=['molecule_name', 'atom_index_1'],
    right_on=['molecule_name_1', 'atom_index_1'],
    validate='many_to_one',
)
# The suffixed molecule-name columns duplicate `molecule_name`; drop them.
joined = joined.drop(columns=['molecule_name_0', 'molecule_name_1'])

# The merges are inner joins, so unmatched pairs would vanish silently.
assert len(joined) == len(test), (
    f"merge changed the row count: {len(test)} -> {len(joined)}"
)

GPD = joined.groupby(['atom_0', 'atom_1', 'type'])
# Fraction of all test rows in each group; `size()` counts rows per group
# without aggregating every column through a Python lambda.
(GPD.size() / len(test)).to_frame('id')

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,id
atom_0,atom_1,type,Unnamed: 3_level_1
H,C,1JHC,0.151907
H,C,2JHC,0.244713
H,C,3JHC,0.324081
H,H,2JHH,0.081071
H,H,3JHH,0.126693
H,N,1JHN,0.009657
H,N,2JHN,0.025713
H,N,3JHN,0.036166


## Train and test have similar coupling-type distributions and share no molecules

In [16]:
# Preview the first ten rows of `structures`.
structures.head(n=10)

Unnamed: 0,molecule_name,atom_index,atom,x,y,z
0,dsgdb9nsd_000001,0,C,-0.012698,1.085804,0.008001
1,dsgdb9nsd_000001,1,H,0.00215,-0.006031,0.001976
2,dsgdb9nsd_000001,2,H,1.011731,1.463751,0.000277
3,dsgdb9nsd_000001,3,H,-0.540815,1.447527,-0.876644
4,dsgdb9nsd_000001,4,H,-0.523814,1.437933,0.906397
5,dsgdb9nsd_000002,0,N,-0.040426,1.024108,0.062564
6,dsgdb9nsd_000002,1,H,0.017257,0.012545,-0.027377
7,dsgdb9nsd_000002,2,H,0.915789,1.358745,-0.028758
8,dsgdb9nsd_000002,3,H,-0.520278,1.343532,-0.775543
9,dsgdb9nsd_000003,0,O,-0.03436,0.97754,0.007602


In [17]:
# Summary statistics for the numeric columns of `structures`
# (same call as the earlier `structures.describe()` cell).
structures.describe()

Unnamed: 0,atom_index,x,y,z
count,2358657.0,2358657.0,2358657.0,2358657.0
mean,8.757349,0.09489178,-0.3337381,0.06241504
std,5.592487,1.655271,1.989152,1.44587
min,0.0,-9.234889,-9.933938,-9.134765
25%,4.0,-0.8746097,-1.826156,-0.8424896
50%,9.0,0.05183615,-0.4035932,0.01093207
75%,13.0,1.116101,1.37366,0.9394357
max,28.0,9.38224,10.18196,7.894733


In [20]:
# Percentage of all atoms in `structures` per element.
structures['atom'].value_counts().div(len(structures)).mul(100)

H    51.231993
C    35.262694
O     7.766581
N     5.611710
F     0.127021
Name: atom, dtype: float64

In [19]:
# Distinct element symbols present in `structures`, in order of first appearance.
structures.atom.unique()

array(['C', 'H', 'N', 'O', 'F'], dtype=object)