In [1]:
import numpy as np
import pandas as pd
import fastai
from tqdm import tqdm_notebook as tqdm
from fastai.tabular import *
import pickle

from multiprocessing import Pool
from sklearn.preprocessing import LabelEncoder, LabelBinarizer, StandardScaler
def _min_max(x):
    """Return the smallest and largest value of ``x`` as a ``(min, max)`` tuple."""
    return x.min(), x.max()


# Exposed on the numpy namespace so later cells can call ``np.range(values)``.
np.range = _min_max

In [2]:
# Read the three input tables in order: training pairs, test pairs, atom structures.
train, test, structures = map(pd.read_csv, ("train.csv", 'test.csv', 'structures.csv'))

In [3]:
# Break the `type` string into two integer-coded columns:
#   type_i - the leading digit of the string, shifted down by one
#   type_a - the trailing character, label-encoded
train['type_i'] = train['type'].str[0].astype(np.int64) - 1
train['type_a'] = LabelEncoder().fit_transform(train['type'].str[-1])
train = train.drop(columns=['type'])

# Shift both atom indices up by one (the minimum seen afterwards is 1).
train[['atom_index_0', 'atom_index_1']] += 1
train.head()

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,scalar_coupling_constant,type_i,type_a
0,0,dsgdb9nsd_000001,2,1,84.8076,0,0
1,1,dsgdb9nsd_000001,2,3,-11.257,1,1
2,2,dsgdb9nsd_000001,2,4,-11.2548,1,1
3,3,dsgdb9nsd_000001,2,5,-11.2543,1,1
4,4,dsgdb9nsd_000001,3,1,84.8074,0,0


In [4]:
# Percentage of training rows falling into each type_i / type_a class.
print(
    train['type_i'].value_counts().div(len(train)).mul(100),
    '\n\n',
    train['type_a'].value_counts().div(len(train)).mul(100),
)

2    48.676115
1    35.163403
0    16.160482
Name: type_i, dtype: float64 

 0    72.141755
1    20.794685
2     7.063560
Name: type_a, dtype: float64


In [5]:
# (min, max) of the distinct atom indices on each side of the pair.
print(*(np.range(train[col].unique()) for col in ('atom_index_0', 'atom_index_1')))

(1, 29) (1, 29)


In [6]:
# NOTE(review): pickle.load executes arbitrary code from the file - only load
# 'molecules.pkl' if it was produced by a trusted source.
with open('molecules.pkl', 'rb') as f:
    _payload = pickle.load(f)

# The pickle holds a 3-item sequence; unpack it into the names used downstream.
molecules_structure, structure_cols, atom_encoder = _payload

In [7]:
# Target: the coupling constant. Features: every other column except the row id.
y = train['scalar_coupling_constant']
x = train.drop(['scalar_coupling_constant', 'id'], axis=1)

In [8]:
# Plain-text preview of the first five feature rows and their targets.
print(
    x.iloc[:5].to_string(),
    '\n\n',
    y.iloc[:5].to_string(),
)

      molecule_name  atom_index_0  atom_index_1  type_i  type_a
0  dsgdb9nsd_000001             2             1       0       0
1  dsgdb9nsd_000001             2             3       1       1
2  dsgdb9nsd_000001             2             4       1       1
3  dsgdb9nsd_000001             2             5       1       1
4  dsgdb9nsd_000001             3             1       0       0 

 0    84.8076
1   -11.2570
2   -11.2548
3   -11.2543
4    84.8074


In [9]:
class Dataset:
    """Minimal indexable container pairing inputs ``x`` with optional targets ``y``.

    Parameters
    ----------
    x : sequence
        Inputs; must support ``len`` and integer / slice indexing.
    y : sequence, optional
        Targets aligned with ``x``. When omitted the dataset is label-free and
        indexing returns inputs only.
    """

    def __init__(self, x, y=None):
        assert (y is None) or (len(x) == len(y))
        self.x, self.y = x, y

    def __getitem__(self, idx):
        """Return one sample, or a new ``Dataset`` when ``idx`` is a slice.

        A labelled dataset yields ``(x[idx], y[idx])``; a label-free one yields
        ``x[idx]`` alone.
        """
        # Decide on this instance's own targets. Consulting a module-level `y`
        # would raise NameError when none exists, or silently treat a
        # label-free dataset as labelled when one does.
        has_targets = self.y is not None
        if isinstance(idx, slice):
            if has_targets:
                return Dataset(self.x[idx], self.y[idx])
            return Dataset(self.x[idx])
        if has_targets:
            return self.x[idx], self.y[idx]
        return self.x[idx]

    def __len__(self):
        """Number of samples (the length of ``x``)."""
        return len(self.x)

In [10]:
class Sampler:
    """Yield batches of integer indices into a dataset.

    Parameters
    ----------
    ds : sized
        Anything supporting ``len``; only its length is used.
    bs : int
        Batch size.
    shuffle : bool
        When true, indices are drawn from a random permutation; otherwise they
        are produced in ascending order.
    drop_last : bool
        When true, the trailing samples that do not fill a whole batch are
        never yielded.
    """

    def __init__(self, ds, bs=64, shuffle=True, drop_last=True):
        print('sampler init, n: ', len(ds))
        self.n, self.bs, self.shuffle = len(ds), bs, shuffle
        # Full dataset length; `self.n` may be truncated below for drop_last.
        self._total = self.n
        self.idxs = self._new_order()
        if drop_last: self.n = (self.n//self.bs)*self.bs
        # The permutation drawn above serves the first pass; later passes redraw.
        self._first_pass_done = False

    def _new_order(self):
        """Return a fresh index order over the whole dataset."""
        return torch.randperm(self._total) if self.shuffle else torch.arange(self._total)

    def __iter__(self):
        # Reshuffle on every pass after the first. With a single fixed
        # permutation each epoch would see identical batches and, with
        # drop_last, the same tail samples would be excluded forever.
        if self.shuffle and self._first_pass_done:
            self.idxs = self._new_order()
        self._first_pass_done = True
        # Bind locally so a concurrent second iterator cannot change this pass.
        idxs = self.idxs
        for i in range(0, self.n, self.bs):
            yield idxs[i : min(self.n, i+self.bs)]

    def __len__(self):
        """Number of batches produced by one pass."""
        return (self.n-1)//self.bs + 1


def collate(batch):
    """Merge a list of ``((x1, x2, x3), y)`` samples into batched tensors.

    Each of the three input components and the target is stacked along a new
    leading dimension. Returns ``((x1, x2, x3), y)`` with every entry stacked.
    """
    inputs, targets = zip(*batch)
    # Regroup per-sample triples into three per-component tuples, then stack each.
    stacked_inputs = tuple(torch.stack(component) for component in zip(*inputs))
    return stacked_inputs, torch.stack(targets)

In [92]:
class Dataloader:
    """Iterate over a dataset in collated batches.

    Parameters
    ----------
    ds : Dataset
        Source of ``(row, target)`` samples; ``row[0]`` is used as the key into
        ``molecules_structure``, ``row[1:3]`` and ``row[3:5]`` are cast to int64.
    bs, shuffle, drop_last
        Forwarded to ``Sampler``.
    collate_fn : callable
        Combines a list of samples into one batch.
    tfrm : callable, optional
        Applied to the looked-up molecule when given. It is called with the
        molecule as its only argument.
    """

    def __init__(self, ds, bs=256, shuffle=True, collate_fn=collate, tfrm=None, drop_last=True):
        self.ds = ds
        self.bs = bs
        self.shuffle = shuffle
        self.collate_fn = collate_fn
        self.tfrm = tfrm
        self.sampler = Sampler(self.ds, self.bs, self.shuffle, drop_last)

    def _get(self, i):
        """Build the ``((indices, meta, mol), target)`` sample for position ``i``."""
        row, target = self.ds[i]

        mol = molecules_structure[row[0]]
        if self.tfrm:
            mol = self.tfrm(mol)

        # Columns 1-2 and 3-4 of the row become two separate int64 tensors.
        indices, meta = (tensor(part.astype(np.int64)) for part in (row[1:3], row[3:5]))
        return (indices, meta, mol), tensor(target)

    def __iter__(self):
        for batch_idxs in self.sampler:
            samples = list(map(self._get, batch_idxs))
            yield self.collate_fn(samples)

    def __len__(self):
        """Number of batches per pass, as reported by the sampler."""
        return len(self.sampler)

In [94]:
# Wrap the underlying arrays of the feature frame `x` and target series `y`
# so samples can be fetched by position.
ds = Dataset(x.values, y.values)

In [127]:
# Small smoke-test loader: `ds[:11]` takes Dataset's slice branch and yields an
# 11-sample Dataset; with bs=2 and drop_last=True the sampler truncates to 10
# indices, i.e. 5 batches in original order (shuffle=False).
dl = Dataloader(ds[:11], bs=2, shuffle=False, drop_last=True)

init called
sampler init, n:  11


In [121]:
def tfrm(mol, p, q, natoms, n_rows=30):
    """Randomly reorder rows ``1..natoms`` of ``mol`` and track two row indices.

    Row 0 and every row above ``natoms`` keep their position; only rows
    ``1..natoms`` are shuffled among themselves.

    Parameters
    ----------
    mol : np.ndarray
        Array indexed by row along its first axis; needs at least ``n_rows`` rows.
    p, q : int
        Row indices (before shuffling) whose new positions are wanted.
    natoms : int
        Number of rows, starting at row 1, that take part in the shuffle.
    n_rows : int, default 30
        Number of leading rows of ``mol`` that are returned. The default keeps
        the previously hard-coded size.

    Returns
    -------
    (mol, p, q)
        The reordered copy of ``mol`` (its first ``n_rows`` rows) and the
        positions at which the original rows ``p`` and ``q`` now sit.

    Raises
    ------
    ValueError
        If ``p`` or ``q`` is not in ``range(n_rows)`` (nothing to unpack).
    """
    perm = np.arange(n_rows)
    perm[1: natoms+1] = np.random.permutation(np.arange(1, natoms+1))
    # perm[new] == old, so the new position of row `old` is where perm equals it.
    new_p, = np.where(perm == p)[0]
    new_q, = np.where(perm == q)[0]
    # Fancy indexing returns a copy; the caller's array is left untouched.
    return mol[perm], new_p, new_q

In [110]:
# Property check for tfrm: every row of the probe array holds its own row
# number, so after shuffling the rows reported for 8 and 1 must still carry
# those values.
for _ in range(10000):
    mol = np.arange(30).reshape((30, 1)) * np.ones((30, 16))
    mol, p, q = tfrm(mol, 8, 1, 20)
    assert (mol[p][0], mol[q][0]) == (8, 1), f'{p,mol[p][0], q, mol[q][0] }'

In [47]:
# Collect every batch from a default Sampler over 100 items and show the first
# batch in sorted order.
# NOTE(review): this rebinds `x`, which earlier held the feature frame used to
# build `ds`; re-running that cell after this one will no longer see the frame.
x = [batch for batch in Sampler(range(100))]
x[0].sort()[0]

sampler init, n:  100


tensor([ 0,  2,  4,  6,  7,  8, 11, 13, 18, 19, 20, 22, 24, 25, 26, 27, 29, 30,
        31, 32, 33, 34, 36, 37, 38, 40, 41, 43, 44, 45, 46, 47, 50, 51, 52, 58,
        60, 61, 62, 65, 67, 69, 72, 73, 74, 75, 76, 77, 79, 80, 82, 84, 85, 86,
        87, 88, 89, 90, 91, 92, 94, 96, 98, 99])