# Define Model

In [1]:
import torch

from utils.DevConf import DevConf

# Prefer the GPU, but fall back to the CPU so the notebook can still run on a
# machine without CUDA instead of failing when tensors/modules are moved to a
# device that does not exist.
DEV_CONF = DevConf(device='cuda' if torch.cuda.is_available() else 'cpu')

In [2]:
from utils.AttnBlocksConf import AttnBlocksConf
from model.BertDecoder.SentiClassifier import SentiClassifier
from model.CombinationModel import CombinationModel
from utils.const import BlockType

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Build the attention configuration first so the decoder constructor call stays short.
# NOTE(review): 768 matches the DistilBERT embedding width shown in the model repr;
# 12 is presumably the head count — confirm against AttnBlocksConf.
attn_conf = AttnBlocksConf(768, 12, nKVHead=6)

# Decoder head (cross-attention block type) plugged into the combined model.
mapper = SentiClassifier(6, attn_conf, BlockType.CROSS)
model = CombinationModel(nClass=6, decoder=mapper, devConf=DEV_CONF)



In [4]:
# Snapshot of the decoder's `_Q` parameter before training; the same print is
# repeated after the training cell so the two values can be compared.
print(model.decoder._Q)

Parameter containing:
tensor([[ 6.0395e-01, -8.1428e-01, -1.3039e+00,  7.6129e-01, -6.2261e-01,
         -3.5434e-01,  2.9630e-01,  2.2628e-01,  6.9910e-01, -2.7239e-01,
         -6.3720e-01,  7.7931e-02, -7.7593e-01,  1.1518e-01, -1.9948e+00,
         -1.2922e-01,  2.4402e-01,  4.2880e-02, -3.7178e-01,  5.0487e-01,
         -6.2309e-01,  6.4418e-01,  7.8791e-01, -1.3183e+00, -2.6399e-01,
          6.3105e-01, -7.6285e-01, -2.2449e-01,  6.4259e-01, -4.4549e-01,
         -4.4144e-01,  1.8294e-02, -1.2467e+00, -4.8446e-01, -4.3753e-01,
         -5.5458e-01, -1.4395e+00, -6.4573e-01, -8.7884e-01,  5.6430e-01,
          1.0989e-01,  4.4094e-01,  7.1416e-01, -7.7362e-01,  1.2592e+00,
          2.8197e-01, -8.2943e-01, -7.0778e-01,  1.4155e+00, -1.3312e+00,
         -1.4538e-01,  3.8820e-01,  9.2164e-01,  1.0232e+00,  1.4780e-01,
          7.7785e-01,  2.6167e-01,  7.4583e-01,  4.7644e-01, -2.8126e-01,
          8.0821e-01, -2.2247e+00,  1.8166e-01, -1.1770e+00, -2.0425e+00,
          1.2166

# Load Data

In [5]:
from transformers import AutoTokenizer
# Checkpoint name and local cache location for the tokenizer files.
TOKENIZER_CHECKPOINT = "distilbert/distilbert-base-multilingual-cased"
TOKENIZER_CACHE_DIR = './cache/tokenizer'

tokenizer = AutoTokenizer.from_pretrained(
    TOKENIZER_CHECKPOINT,
    cache_dir=TOKENIZER_CACHE_DIR,
)

In [6]:
import pandas as pd

In [7]:
# Location of the labelled training table, relative to the notebook's working directory.
TRAIN_CSV_PATH = 'data/archive/train.csv'
train = pd.read_csv(TRAIN_CSV_PATH)

In [8]:
import torch
from torch.utils.data import Dataset, DataLoader, random_split

In [9]:
class MyDataset(Dataset):
    """Map-style dataset yielding ``(text, label_vector)`` pairs from a DataFrame.

    Tokenization is intentionally not done here: items are returned as raw
    text so that a collate function can tokenize a whole batch at once.

    Args:
        df: DataFrame holding one sample per row.
        tokenizer: Stored on the instance as ``self.tokenizer``; not used by
            ``__getitem__``.
        text_column: Name of the column containing the input text.
        label_columns: Iterable of column names that make up the label vector,
            in output order. Defaults to ``LABEL_COLUMNS``.
    """

    # Default label columns, in the order they appear in the returned tensor.
    LABEL_COLUMNS = (
        "Computer Science",
        "Physics",
        "Mathematics",
        "Statistics",
        "Quantitative Biology",
        "Quantitative Finance",
    )

    def __init__(self, df, tokenizer, text_column='ABSTRACT', label_columns=None):
        self.df = df
        self.tokenizer = tokenizer
        self.text_column = text_column
        self.label_columns = list(
            self.LABEL_COLUMNS if label_columns is None else label_columns
        )

    def __len__(self):
        """Return the number of rows in the backing DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(text, label)`` for the row at integer position ``idx``.

        ``label`` is a 1-D tensor with one entry per configured label column.
        """
        # Materialise the row once; the previous version rebuilt it via
        # ``df.iloc[idx]`` for the text and again for every label column.
        row = self.df.iloc[idx]
        text = row[self.text_column]
        label = torch.tensor([row[col] for col in self.label_columns])
        return text, label

In [10]:
def collect_fn(batch):
    """Collate ``(text, label)`` samples into a tokenized batch and a label tensor.

    Uses the notebook-level ``tokenizer`` and ``DEV_CONF``.

    Args:
        batch: Sequence of ``(text, label_tensor)`` pairs.

    Returns:
        A pair ``(encoded, labels)``: the tokenizer output for all texts
        (padded/truncated to 512 tokens, PyTorch tensors) and the labels
        stacked along a new first dimension, both moved to ``DEV_CONF.device``.
    """
    texts, labels = zip(*batch)
    encoded = tokenizer(
        texts,
        return_tensors='pt',
        padding='max_length',
        truncation=True,
        max_length=512,
    )
    stacked_labels = torch.stack(labels)
    return encoded.to(device=DEV_CONF.device), stacked_labels.to(DEV_CONF.device)

In [11]:
dataset = MyDataset(train, tokenizer)

# Hold out 20% of the samples for evaluation; the remainder is used for training.
datasize = len(dataset)
splitIndex = int(datasize * 0.2)
trainDataSize = datasize - splitIndex

# Seed the split so the held-out samples are identical on every run. With an
# unseeded split, accuracy figures from different runs are measured on
# different samples, and weights reloaded from an earlier run may be evaluated
# on samples they were trained on.
SPLIT_SEED = 42
train_dataset, test_dataset = random_split(
    dataset,
    [trainDataSize, splitIndex],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

train_loader = DataLoader(train_dataset, collate_fn=collect_fn, batch_size=8, shuffle=True)
# Evaluation only counts correct predictions, so sample order is irrelevant.
test_loader = DataLoader(test_dataset, collate_fn=collect_fn, batch_size=8, shuffle=False)

# Train

In [12]:
from torch import nn

In [13]:
# Optimisation settings for the training run below.
lr = 1e-5
epochs = 1
# nn.BCELoss expects probabilities in [0, 1] as its input.
# NOTE(review): assumes the model's forward already applies a sigmoid — confirm.
loss_fn = nn.BCELoss()
# Everything returned by model.parameters() is handed to the optimizer.
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)

In [14]:
def train_fn(model, train_loader, loss_fn, optimizer, epochs):
    """Run a basic supervised training loop.

    For every batch: clear gradients, run the forward pass, compute the loss
    against the float-cast labels, back-propagate and step the optimizer.
    The loss of the current batch is printed once every 100 batches.

    Args:
        model: Callable invoked as ``model(**inputs)``.
        train_loader: Sized iterable of ``(inputs, label)`` pairs, where
            ``inputs`` is a mapping of keyword arguments for ``model``.
        loss_fn: Callable ``loss_fn(prediction, target)`` returning an object
            with ``backward()`` and ``item()``.
        optimizer: Object providing ``zero_grad()`` and ``step()``.
        epochs: Number of passes over ``train_loader``.
    """
    for epoch_no in range(1, epochs + 1):
        for batch_no, (inputs, target) in enumerate(train_loader, start=1):
            optimizer.zero_grad()
            prediction = model(**inputs)
            batch_loss = loss_fn(prediction, target.float())
            batch_loss.backward()
            optimizer.step()

            # Only report on every 100th batch.
            if batch_no % 100 != 0:
                continue
            print(f"Epoch {epoch_no}/{epochs} - Batch {batch_no}/{len(train_loader)} - Loss: {batch_loss.item()}")

In [15]:
# Put the model in training mode, then run the loop defined in train_fn with
# the loss, optimizer and epoch count configured above.
model.train()
train_fn(model, train_loader, loss_fn, optimizer, epochs)
print("Done")

Epoch 1/1 - Batch 100/2098 - Loss: 0.36848533153533936
Epoch 1/1 - Batch 200/2098 - Loss: 0.321755975484848
Epoch 1/1 - Batch 300/2098 - Loss: 0.5310551524162292
Epoch 1/1 - Batch 400/2098 - Loss: 0.26477938890457153
Epoch 1/1 - Batch 500/2098 - Loss: 0.2662111222743988
Epoch 1/1 - Batch 600/2098 - Loss: 0.2252422571182251
Epoch 1/1 - Batch 700/2098 - Loss: 0.14750906825065613
Epoch 1/1 - Batch 800/2098 - Loss: 0.22241860628128052
Epoch 1/1 - Batch 900/2098 - Loss: 0.31430137157440186
Epoch 1/1 - Batch 1000/2098 - Loss: 0.1819993257522583
Epoch 1/1 - Batch 1100/2098 - Loss: 0.37738537788391113
Epoch 1/1 - Batch 1200/2098 - Loss: 0.2848208546638489
Epoch 1/1 - Batch 1300/2098 - Loss: 0.2475285530090332
Epoch 1/1 - Batch 1400/2098 - Loss: 0.13623857498168945
Epoch 1/1 - Batch 1500/2098 - Loss: 0.2045365869998932
Epoch 1/1 - Batch 1600/2098 - Loss: 0.2803414762020111
Epoch 1/1 - Batch 1700/2098 - Loss: 0.2721858024597168
Epoch 1/1 - Batch 1800/2098 - Loss: 0.3539148271083832
Epoch 1/1 - B

In [16]:
# `_Q` after training: compare with the pre-training print of the same
# parameter to confirm the decoder weights were actually updated.
print(model.decoder._Q)

Parameter containing:
tensor([[ 6.0368e-01, -8.1372e-01, -1.3034e+00,  7.6100e-01, -6.2226e-01,
         -3.5416e-01,  2.9610e-01,  2.2619e-01,  6.9881e-01, -2.7238e-01,
         -6.3697e-01,  7.7920e-02, -7.7544e-01,  1.1514e-01, -1.9940e+00,
         -1.2938e-01,  2.4391e-01,  4.2925e-02, -3.7178e-01,  5.0495e-01,
         -6.2293e-01,  6.4387e-01,  7.8746e-01, -1.3177e+00, -2.6376e-01,
          6.3053e-01, -7.6249e-01, -2.2444e-01,  6.4225e-01, -4.4528e-01,
         -4.4119e-01,  1.8330e-02, -1.2461e+00, -4.8435e-01, -4.3747e-01,
         -5.5416e-01, -1.4390e+00, -6.4568e-01, -8.7843e-01,  5.6404e-01,
          1.0990e-01,  4.4082e-01,  7.1394e-01, -7.7282e-01,  1.2581e+00,
          2.8174e-01, -8.2885e-01, -7.0741e-01,  1.4150e+00, -1.3309e+00,
         -1.4537e-01,  3.8782e-01,  9.2136e-01,  1.0229e+00,  1.4751e-01,
          7.7730e-01,  2.6152e-01,  7.4567e-01,  4.7614e-01, -2.8120e-01,
          8.0779e-01, -2.2239e+00,  1.8151e-01, -1.1765e+00, -2.0418e+00,
          1.2163

# Eval

In [17]:
# Switch to evaluation mode before inference (this disables the Dropout layers
# visible in the module repr). The call returns the module, so its repr is
# displayed as the cell output.
model.eval()

CombinationModel(
  (distilBert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1):

In [18]:
# Spot-check a single example: thresholded prediction vs. ground-truth labels.
# Named `sample_idx` rather than `id` so the builtin `id()` is not shadowed.
sample_idx = 13
# Fetch the item once instead of indexing the dataset for every use.
sample_text, sample_label = dataset[sample_idx]
sample = tokenizer(sample_text, return_tensors='pt', padding='max_length', truncation=True, max_length=512).to(device=DEV_CONF.device)
# Inference only, so no autograd graph is needed.
with torch.no_grad():
    output = torch.where(model(**sample) > 0.2, 1, 0)
print(output)
print(sample_label)
# Labels are created on the CPU by the dataset, so bring the prediction back before comparing.
ans = torch.eq(output.to("cpu"), sample_label)
print(ans)
print(torch.all(ans))

tensor([[0, 0, 1, 0, 0, 0]], device='cuda:0')
tensor([1, 0, 0, 0, 0, 0])
tensor([[False,  True, False,  True,  True,  True]])
tensor(False)


In [19]:
# acc = [0] * 6
# testdata = train.sample(1000)
# test_dataset = MyDataset(testdata, tokenizer)

# for data in test_dataset:
#     # print(data[1][0])
#     sample = tokenizer(data[0], return_tensors='pt', padding='max_length', truncation=True, max_length=512).to(device=DEV_CONF.device)
#     output = torch.where(model(**sample) > 0.2, 1, 0)
#     ansList = torch.eq(output.squeeze().to("cpu"), data[1])
#     for i, ans in enumerate(ansList):
#         # print(ans)
#         if ans:
#             acc[i] += 1

In [24]:
%pip install scikit-learn




In [None]:
from sklearn.metrics import confusion_matrix

In [20]:
def test(model, test_loader):
    acc = [0] * 6
    for (data, label) in test_loader:
        output = torch.where(model(**data) > 0.2, 1, 0)
        ansList = torch.eq(output, label)
        # print(ansList)
        for ans in ansList:
            for i, a in enumerate(ans):
                if a:
                    acc[i] += 1
    return acc

In [21]:
# Per-label-column count of correct predictions over the held-out loader.
acc = test(model, test_loader)

KeyboardInterrupt: 

In [None]:
# Turn the per-column correct counts into fractions of the held-out set size.
per_class_accuracy = [hits / splitIndex for hits in acc]
print(per_class_accuracy)

[0.8254649499284692, 0.8774439675727229, 0.8438245112064855, 0.8073438245112065, 0.9737720553171197, 0.9928469241773963]


- noGradBert — per-class accuracy on the held-out split (values in label-column order):

[0.8483547925608012, 0.8457319980925131, 0.8612303290414879, 0.757033857892227, 0.9644730567477349, 0.9895088221268479]

- GradBert — per-class accuracy on the held-out split (values in label-column order):

[0.8254649499284692, 0.8774439675727229, 0.8438245112064855, 0.8073438245112065, 0.9737720553171197, 0.9928469241773963]

In [None]:
# torch.save(model.state_dict(), 'weights/model-240515-1357.pth')

In [None]:
# model.load_state_dict(torch.load('weights/model-240515-1336.pth'))

<All keys matched successfully>