In [1]:
import numpy as np
import pandas as pd
import fastai
from tqdm import tqdm_notebook as tqdm
from fastai.tabular import *
import pickle

from multiprocessing import Pool
from sklearn.preprocessing import LabelEncoder, LabelBinarizer, StandardScaler
def _min_max(values):
    """Return the ``(min, max)`` pair of an object exposing ``.min()`` and ``.max()``."""
    return values.min(), values.max()


# Attached to the numpy namespace so later cells can call ``np.range(...)``.
np.range = _min_max

In [2]:
# Read the three competition tables in one pass: train, test, structures.
train, test, structures = (
    pd.read_csv(csv_path)
    for csv_path in ("train.csv", 'test.csv', 'structures.csv')
)

In [3]:
def _split_coupling_type(frame):
    """Return a copy of ``frame`` with ``type`` split into features and atom indices shifted.

    The ``type`` column is replaced by two columns appended at the end:

    * ``type_i`` -- first character of ``type`` parsed as an integer, minus one.
    * ``type_a`` -- last character of ``type`` (still a string here; it is
      label-encoded right after this helper is applied).

    ``atom_index_0`` and ``atom_index_1`` are each increased by one.
    NOTE(review): presumably this frees index 0 for a padding row in the
    molecule structure arrays -- confirm against how ``molecules.pkl`` is built.

    The input frame is not modified; the same transformation is applied to
    train and test so the two cannot drift apart.
    """
    coupling = frame['type']
    return (
        frame
        .assign(
            # Vectorized equivalent of ``int(list(s)[0]) - 1`` applied per row.
            type_i=coupling.str[0].astype('int64') - 1,
            # Vectorized equivalent of ``list(s)[-1]`` applied per row.
            type_a=coupling.str[-1],
            atom_index_0=frame['atom_index_0'] + 1,
            atom_index_1=frame['atom_index_1'] + 1,
        )
        .drop(columns=['type'])
    )


train = _split_coupling_type(train)
test = _split_coupling_type(test)

# Fit the encoder on train only, then apply the same mapping to both frames.
type_a_encoder = LabelEncoder().fit(train['type_a'])
train['type_a'] = type_a_encoder.transform(train['type_a'])
test['type_a'] = type_a_encoder.transform(test['type_a'])

In [4]:
# Load the precomputed molecule bundle. pickle.load executes arbitrary code
# from the file, so 'molecules.pkl' must come from a trusted source.
with open('molecules.pkl', 'rb') as pkl_file:
    molecule_bundle = pickle.load(pkl_file)

# The bundle unpacks into exactly three objects.
molecules_structure, structure_cols, atom_encoder = molecule_bundle

In [7]:
# molecules_structue = pd.DataFrame.from_dict(molecules_structure, 'index')
# molecules_structue.reset_index(inplace=True)
# molecules_structue.columns = ['molecule_name', 'natoms', 'structure']

# train = train.merge(molecules_structue, on='molecule_name')
# train['target'] = train.scalar_coupling_constant
# train.drop(columns=['scalar_coupling_constant'], inplace = True)
# train.head()

# test = test.merge(molecules_structue, on='molecule_name')
# test.head()

In [7]:
# Sanity check: (rows, columns) of the train and test frames at this point.
train.shape, test.shape

((4658147, 7), (2505542, 6))

In [8]:
# Cache the preprocessed frames as a two-element list: [train, test].
preprocessed_frames = [train, test]
with open('train_test_pre.pkl', 'wb') as out_file:
    pickle.dump(preprocessed_frames, out_file)

In [9]:
# Share of rows (in percent) for each value of the two derived type columns.
n_rows = len(train)
type_i_pct = train.type_i.value_counts()/n_rows*100
type_a_pct = train.type_a.value_counts()/n_rows*100
print(type_i_pct,'\n\n', type_a_pct)

2    48.676115
1    35.163403
0    16.160482
Name: type_i, dtype: float64 

 0    72.141755
1    20.794685
2     7.063560
Name: type_a, dtype: float64


In [10]:
# (min, max) of the distinct atom indices in each index column.
index_0_span = np.range(train.atom_index_0.unique())
index_1_span = np.range(train.atom_index_1.unique())
print(index_0_span, index_1_span)

(1, 29) (1, 29)


In [11]:
# NOTE(review): this reads the same file into the same three names as an
# earlier cell of this notebook; the reload looks redundant -- confirm before
# removing. pickle.load runs arbitrary code, so the file must be trusted.
with open('molecules.pkl', 'rb') as f:
    molecules_structure, structure_cols, atom_encoder = pickle.load(f)

In [12]:
# Target column first, then the feature frame without the target and row id.
y = train['scalar_coupling_constant']
x = train.drop(columns=['scalar_coupling_constant', 'id'])

In [13]:
# Plain-text preview of the first rows of the features and of the target.
x_preview = x.head().to_string()
y_preview = y.head().to_string()
print(x_preview, '\n\n', y_preview)

      molecule_name  atom_index_0  atom_index_1  type_i  type_a
0  dsgdb9nsd_000001             2             1       0       0
1  dsgdb9nsd_000001             2             3       1       1
2  dsgdb9nsd_000001             2             4       1       1
3  dsgdb9nsd_000001             2             5       1       1
4  dsgdb9nsd_000001             3             1       0       0 

 0    84.8076
1   -11.2570
2   -11.2548
3   -11.2543
4    84.8074


In [14]:
class Dataset:
    """Minimal indexable container pairing inputs ``x`` with optional targets ``y``.

    Parameters
    ----------
    x : sequence
        Indexable collection of inputs; must support ``len`` and ``[]``.
    y : sequence, optional
        Targets aligned with ``x``. When given it must have the same length.

    Indexing
    --------
    * A ``slice`` returns a new ``Dataset`` over the sliced inputs (and sliced
      targets when targets are present).
    * Any other index returns ``(x[idx], y[idx])`` when targets are present,
      otherwise just ``x[idx]``.
    """

    def __init__(self, x, y=None):
        assert (y is None) or (len(x) == len(y))
        self.x, self.y = x, y

    def __getitem__(self, idx):
        # Decide on the instance's own targets. Testing a module-level ``y``
        # here fails on a fresh kernel and picks the wrong branch for a
        # dataset created without targets.
        has_targets = self.y is not None
        if isinstance(idx, slice):
            if has_targets:
                return Dataset(self.x[idx], self.y[idx])
            return Dataset(self.x[idx])
        if has_targets:
            return self.x[idx], self.y[idx]
        return self.x[idx]

    def __len__(self):
        return len(self.x)

In [82]:
class Sampler:
    """Yield batches of dataset indices as 1-D integer tensors.

    Parameters
    ----------
    ds : sized
        Anything supporting ``len``; only its length is used.
    bs : int
        Number of indices per batch.
    shuffle : bool
        When true, a fresh random permutation is drawn at the start of every
        iteration, so each epoch sees a different order.
    drop_last : bool
        When true, the trailing partial batch is dropped so every batch holds
        exactly ``bs`` indices.
    """

    def __init__(self, ds, bs=64, shuffle=True, drop_last=True):
        print('sampler init, n: ', len(ds))
        self.n, self.bs, self.shuffle = len(ds), bs, shuffle
        # Order over the whole dataset; kept at full length even when
        # ``self.n`` is truncated below.
        self.idxs = torch.randperm(self.n) if self.shuffle else torch.arange(self.n)
        if drop_last: self.n = (self.n//self.bs)*self.bs

    def __iter__(self):
        if self.shuffle:
            # Reshuffle per epoch. A permutation fixed in __init__ would repeat
            # the same order every epoch and, with drop_last, always discard
            # the same samples.
            self.idxs = torch.randperm(len(self.idxs))
        for start in range(0, self.n, self.bs):
            # Stop at self.n so samples dropped by drop_last are never yielded;
            # slicing clamps at the end when the last batch is short.
            yield self.idxs[start : min(self.n, start+self.bs)]

    def __len__(self):
        # Ceiling division of n by bs (0 batches when n is 0).
        return (self.n-1)//self.bs + 1


def collate(batch):
    """Stack a list of ``((a, b, c), target)`` samples into batched tensors.

    Each sample contributes three input tensors and one target tensor. The
    result is ``((A, B, C), Y)`` where every element is the ``torch.stack`` of
    the corresponding per-sample tensors along a new leading dimension.
    """
    inputs, targets = zip(*batch)
    firsts, seconds, thirds = zip(*inputs)
    stacked_inputs = tuple(torch.stack(group) for group in (firsts, seconds, thirds))
    return stacked_inputs, torch.stack(targets)

In [83]:
class Dataloader:
    """Iterate over a dataset in collated batches, attaching molecule structures.

    Parameters
    ----------
    ds : Dataset
        Indexing it with a single index must return ``(row, target)``. ``row[0]``
        is used as a key into the module-level ``molecules_structure`` mapping,
        ``row[1:3]`` and ``row[3:5]`` are converted to int64 tensors.
    bs : int
        Batch size handed to the ``Sampler``.
    shuffle : bool
        Forwarded to the ``Sampler``.
    collate_fn : callable
        Combines a list of ``((indices, meta, mol), target)`` samples into a batch.
    tfrm : callable, optional
        When given, called as ``tfrm(mol)`` on each sample's structure and its
        return value replaces the structure.
    drop_last : bool
        Forwarded to the ``Sampler``.
    """

    def __init__(self, ds, bs=256, shuffle=True, collate_fn=collate, tfrm=None, drop_last=True):
        self.ds, self.bs, self.shuffle, self.collate_fn, self.tfrm = ds, bs, shuffle, collate_fn, tfrm
        self.sampler = Sampler(self.ds, self.bs, self.shuffle, drop_last)

    def _get(self, i):
        """Build one sample ``((indices, meta, mol), target)`` for dataset index ``i``."""
        x, y = self.ds[i]
        # Entry [1] of the per-molecule record is the structure array. The
        # former `natoms` lookup was unused and referenced an undefined name
        # (`molecule_structure`), so it has been removed.
        mol = molecules_structure[x[0]][1]
        if self.tfrm: mol = self.tfrm(mol)

        indices = tensor(x[1:3].astype(np.int64))
        meta = tensor(x[3:5].astype(np.int64))

        return (indices, meta, mol), tensor(y)

    def __iter__(self):
        for idxs in self.sampler:
            yield self.collate_fn([self._get(i) for i in idxs])

    def __len__(self):
        # Number of batches, as reported by the sampler.
        return len(self.sampler)

In [84]:
# Wrap the raw numpy arrays (not the pandas objects) so integer indexing is positional.
ds = Dataset(x.values, y.values)

In [85]:
# Tiny smoke-test loader: first 11 samples, batches of 3, fixed order, partial batch dropped.
dl = Dataloader(ds[:11], bs=3, shuffle=False, drop_last=True)

sampler init, n:  11


In [86]:
def tfrm(mol, p, q, natoms, max_atoms=30):
    """Randomly permute slots ``1..natoms`` of ``mol`` and remap two slot indices.

    Parameters
    ----------
    mol : array-like
        Indexable along its first axis with an integer array; rows are
        reordered by the permutation. Only the first ``max_atoms`` rows are
        addressed by the permutation.
    p, q : int
        Slot indices (before permutation) to track.
    natoms : int
        Slots ``1..natoms`` are shuffled; slot 0 and slots above ``natoms``
        keep their position.
    max_atoms : int, default 30
        Length of the permutation (previously a hard-coded 30).

    Returns
    -------
    (mol, p, q)
        The reordered structure and the new positions of the rows that were
        originally at ``p`` and ``q``.

    Notes
    -----
    Uses numpy's global random state (``np.random.permutation``).
    """
    perm = np.arange(max_atoms)
    perm[1: natoms+1] = np.random.permutation(np.arange(1, natoms+1))
    # Row i of the result is mol[perm[i]], so the row formerly at index p now
    # sits wherever perm equals p. The one-element unpack raises ValueError if
    # the index does not occur in the permutation.
    new_p, = np.where(perm == p)[0]
    new_q, = np.where(perm == q)[0]
    return mol[perm], new_p, new_q

In [87]:
# Start one pass over the smoke-test loader.
it = iter(dl)

In [88]:
# Pull one batch and display it to inspect the collated tensors.
next(it)

((tensor([[2, 1],
          [2, 3],
          [2, 4]]), tensor([[0, 0],
          [1, 1],
          [1, 1]]), tensor([[[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
           [ 1.0000, -0.0650,  0.7136,  ...,  1.0000, -0.9759, -1.0667],
           [ 3.0000, -0.0560,  0.1647,  ...,  1.0000, -0.9882, -2.0328],
           ...,
           [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
           [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
           [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],
  
          [[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
           [ 1.0000, -0.0650,  0.7136,  ...,  1.0000, -0.9759, -1.0667],
           [ 3.0000, -0.0560,  0.1647,  ...,  1.0000, -0.9882, -2.0328],
           ...,
           [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
           [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
           [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,

In [22]:
# Inspect one raw sample: (feature row, target).
ds[0]

(array(['dsgdb9nsd_000001', 2, 1, 0, 0], dtype=object), 84.8076)

In [None]:
def get_dls():
    """Placeholder for a dataloader factory; not implemented yet.

    The original cell was an unfinished ``def`` line (no colon, no body) and a
    syntax error. It is kept as an explicit stub so the notebook parses and the
    gap is visible at call time.

    Raises
    ------
    NotImplementedError
        Always, until the factory is written.
    """
    # TODO: build and return the train/validation Dataloader objects.
    raise NotImplementedError("get_dls is not implemented yet")

In [90]:
# NOTE(review): leftover interactive source lookup (IPython `??`) for Learner,
# presumably the class brought in by the fastai star import -- remove this cell
# and its output from the finished notebook.
Learner??

[0;31mInit signature:[0m
[0mLearner[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mdata[0m[0;34m:[0m[0mfastai[0m[0;34m.[0m[0mbasic_data[0m[0;34m.[0m[0mDataBunch[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmodel[0m[0;34m:[0m[0mtorch[0m[0;34m.[0m[0mnn[0m[0;34m.[0m[0mmodules[0m[0;34m.[0m[0mmodule[0m[0;34m.[0m[0mModule[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mopt_func[0m[0;34m:[0m[0mCallable[0m[0;34m=[0m[0mfunctools[0m[0;34m.[0m[0mpartial[0m[0;34m([0m[0;34m<[0m[0;32mclass[0m [0;34m'torch.optim.adam.Adam'[0m[0;34m>[0m[0;34m,[0m [0mbetas[0m[0;34m=[0m[0;34m([0m[0;36m0.9[0m[0;34m,[0m [0;36m0.99[0m[0;34m)[0m[0;34m)[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mloss_func[0m[0;34m:[0m[0mCallable[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmetrics[0m[0;34m:[0m[0mCollection[0m[0;34m[[0m[0mCallable[0m[0;34m][0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    