# Define Model

In [16]:
from utils.DevConf import DevConf
DEV_CONF = DevConf(device='cpu')

In [17]:
from utils.AttnBlocksConf import AttnBlocksConf
from model.BertDecoder.SentiClassifier import SentiClassifier
from model.CombinationModel import CombinationModel
from utils.const import BlockType

In [79]:
mapper = SentiClassifier(6, AttnBlocksConf(768, 12, nKVHead=6), BlockType.CROSS)
model = CombinationModel(nClass=6, decoder=mapper, devConf=DEV_CONF)

# Load Data

In [12]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-multilingual-cased", cache_dir='./cache/tokenizer')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import pandas as pd

In [63]:
train = pd.read_csv('data/archive/train.csv')

In [50]:
example = tokenizer(
    # [train.iloc[0]['ABSTRACT'], train.iloc[1]['ABSTRACT']],
    train.iloc[0]['ABSTRACT'],
    return_tensors='pt',
    padding='max_length',
    # truncation=True,
    max_length=512)

In [51]:
model(**example)

tensor([[0.1975, 0.1718, 0.1658, 0.1919, 0.2730]], grad_fn=<SoftmaxBackward0>)

In [52]:
print(example)

{'input_ids': tensor([[   101,  90138, 106788,  11942,  22441,  21992,  20036,    118,  19182,
          10106,  52790,  10841, 106615,  10157,  19308,  21911,  16382,  42141,
          24965,  10106,  14861,  56859,  10369,  13808,  11165,    119,  90491,
            169,  20036,    112,    187,  11165,    117,  10106,  52790,  10944,
          10347,  11019,  10160,  10551,  21559,    131,  18331,    117,    177,
            119,    173,    119,  38938,  12752,  14403,  40018,  24713,  25599,
          10142,  10105,  20036,    117,  10111,  11436,    117,    177,    119,
            173,    119,  10349,  56906,  10230,  24713,  18514,  10135,  11948,
          16080,  94367,  19868,  27756,  10188,  10105,  20036,    112,    187,
          11165,    119,  14600,  18331,  10106,  52790,  10124,  24300,  11031,
            117,  11436,  10106,  52790,    117,  10319,  10944,  10347,  11031,
          10114,  12188,  20036,    118,  19182,  18514,  19350,    117,  10124,
          5787

In [14]:
import torch
from torch.utils.data import Dataset, DataLoader

In [67]:
class MyDataset(Dataset):
    def __init__(self, df, tokenizer):
        self.df = df
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        text = self.df.iloc[idx]['ABSTRACT']
        label = torch.tensor([self.df.iloc[idx][i] for i in ["Computer Science","Physics","Mathematics","Statistics","Quantitative Biology","Quantitative Finance"]])
        return text, label
        # return self.tokenizer(text, return_tensors='pt', padding='max_length', truncation=True, max_length=128), label

In [71]:
def collect_fn(batch):
    texts, labels = zip(*batch)
    # print(texts)
    return tokenizer(texts, return_tensors='pt', padding='max_length', truncation=True, max_length=512), torch.stack(labels)

In [72]:
train_dataset = MyDataset(train, tokenizer)
train_loader = DataLoader(train_dataset, collate_fn=collect_fn, batch_size=8, shuffle=True)

In [73]:
print(train_dataset[0])

("  Predictive models allow subject-specific inference when analyzing disease\nrelated alterations in neuroimaging data. Given a subject's data, inference can\nbe made at two levels: global, i.e. identifiying condition presence for the\nsubject, and local, i.e. detecting condition effect on each individual\nmeasurement extracted from the subject's data. While global inference is widely\nused, local inference, which can be used to form subject-specific effect maps,\nis rarely used because existing models often yield noisy detections composed of\ndispersed isolated islands. In this article, we propose a reconstruction\nmethod, named RSM, to improve subject-specific detections of predictive\nmodeling approaches and in particular, binary classifiers. RSM specifically\naims to reduce noise due to sampling error associated with using a finite\nsample of examples to train classifiers. The proposed method is a wrapper-type\nalgorithm that can be used with different binary classifiers in a diag

In [74]:
next(iter(train_loader))

({'input_ids': tensor([[   101,  10117,  32818,  ...,      0,      0,      0],
         [   101,  10747,  17895,  ...,      0,      0,      0],
         [   101,  18742, 107114,  ...,      0,      0,      0],
         ...,
         [   101,  10167,  10531,  ...,      0,      0,      0],
         [   101,  12865,  14687,  ...,      0,      0,      0],
         [   101,  17144,    109,  ...,      0,      0,      0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]])},
 tensor([[1, 0, 0, 0, 0, 0],
         [0, 0, 1, 1, 0, 0],
         [0, 0, 0, 1, 0, 0],
         [0, 0, 1, 0, 0, 0],
         [0, 1, 0, 0, 0, 0],
         [0, 0, 1, 0, 0, 0],
         [0, 0, 1, 1, 0, 0],
         [0, 0, 1, 0, 0, 0]]))

# Train

In [19]:
from torch import nn

In [80]:
lr = 1e-5
epochs = 1
loss_fn = nn.BCEWithLogitsLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)

In [77]:
def train_fn(model, train_loader, loss_fn, optimizer, epochs):
    for epoch in range(epochs):
        for i, (data, label) in enumerate(train_loader):
            # print(data['input_ids'])
            # break
            optimizer.zero_grad()
            output = model(**data)
            loss = loss_fn(output, label.float())
            loss.backward()
            optimizer.step()
            print(f"Epoch {epoch+1}/{epochs} - Batch {i+1}/{len(train_loader)} - Loss: {loss.item()}")

In [81]:
model.train()
train_fn(model, train_loader, loss_fn, optimizer, epochs)

Epoch 1/1 - Batch 1/2622 - Loss: 0.7556924223899841
Epoch 1/1 - Batch 2/2622 - Loss: 0.7466787695884705
Epoch 1/1 - Batch 3/2622 - Loss: 0.7654260993003845
Epoch 1/1 - Batch 4/2622 - Loss: 0.7453203797340393
Epoch 1/1 - Batch 5/2622 - Loss: 0.7198200225830078
Epoch 1/1 - Batch 6/2622 - Loss: 0.7136686444282532
Epoch 1/1 - Batch 7/2622 - Loss: 0.747882604598999
Epoch 1/1 - Batch 8/2622 - Loss: 0.715918779373169
Epoch 1/1 - Batch 9/2622 - Loss: 0.7340836524963379
Epoch 1/1 - Batch 10/2622 - Loss: 0.7133667469024658
Epoch 1/1 - Batch 11/2622 - Loss: 0.7522581219673157
Epoch 1/1 - Batch 12/2622 - Loss: 0.7136598229408264
Epoch 1/1 - Batch 13/2622 - Loss: 0.6935104727745056
Epoch 1/1 - Batch 14/2622 - Loss: 0.7332396507263184
Epoch 1/1 - Batch 15/2622 - Loss: 0.7126179337501526
Epoch 1/1 - Batch 16/2622 - Loss: 0.7741646766662598
Epoch 1/1 - Batch 17/2622 - Loss: 0.7133728861808777
Epoch 1/1 - Batch 18/2622 - Loss: 0.692919909954071
Epoch 1/1 - Batch 19/2622 - Loss: 0.7539355754852295
Epoch

KeyboardInterrupt: 