# Define Model

In [1]:
from transformers import AutoTokenizer, DistilBertModel
bertTokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-multilingual-cased")
bertModel = DistilBertModel.from_pretrained("distilbert/distilbert-base-multilingual-cased")
bertModel.eval()

DistilBertModel(
  (embeddings): Embeddings(
    (word_embeddings): Embedding(119547, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (layer): ModuleList(
      (0-5): 6 x TransformerBlock(
        (attention): MultiHeadSelfAttention(
          (dropout): Dropout(p=0.1, inplace=False)
          (q_lin): Linear(in_features=768, out_features=768, bias=True)
          (k_lin): Linear(in_features=768, out_features=768, bias=True)
          (v_lin): Linear(in_features=768, out_features=768, bias=True)
          (out_lin): Linear(in_features=768, out_features=768, bias=True)
        )
        (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (ffn): FFN(
          (dropout): Dropout(p=0.1, inplace=False)
          (lin1): Linear(in_features=768, out_features=3072, bias=True)
          (lin2): L

In [2]:
from utils.DevConf import DevConf
devConf = DevConf('cpu')

In [3]:
from utils.AttnBlocksConf import AttnBlocksConf

In [4]:
from model.BertDecoder.SentiClassifier import SentiClassifier
from model.CombinationModel import CombinationModel
from torch import nn

In [5]:
from utils.const import BlockType

In [6]:
mapper = SentiClassifier(AttnBlocksConf(768, 12, 6), BlockType.PARALLEL,devConf=devConf)
cModel = CombinationModel(
    distilBert=bertModel,
    decoder=mapper,
    outputProject=nn.Linear(768, 2, device=devConf.device))

# Load Data

In [7]:
from torchtext.datasets import IMDB
from torchtext.data.functional import to_map_style_dataset

In [8]:
trainData = iter(IMDB(root="./data", split='train'))

In [9]:
from torch.utils.data import DataLoader

In [10]:
import torch
from torch import Tensor

In [11]:
def collate_fn(batch)->tuple[Tensor, Tensor, Tensor]:
    target =[]
    datas = []

    for i, (label, data) in enumerate(batch):
        datas.append(data)
        target.append(0 if label == 'neg' else 1)

    encoding = bertTokenizer(datas, return_tensors='pt', padding=True, truncation=True, max_length=512)
    ids = encoding['input_ids']
    mask = encoding['attention_mask']
    
    return ids, mask, torch.tensor(target, dtype=torch.int16)

In [12]:
trainLoader = DataLoader(to_map_style_dataset(trainData), collate_fn=collate_fn, batch_size=16, shuffle=True)

# Train

In [13]:
lr = 1e-5
epochs = 1

In [14]:
loss_fn = nn.CrossEntropyLoss()

In [15]:
optimizer = torch.optim.AdamW(cModel.parameters(), lr=lr)

In [16]:
def train(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    model.train()
    for batch, (x, y, z) in enumerate(dataloader):
        x, y = x.to('cpu'), y.to('cpu')
        z = z.to(devConf.device).to(torch.long)
        if x.shape[1] <= 512:
            # Compute prediction error
            pred = model(input_ids=x, attention_mask=y)
            loss = loss_fn(pred, z)

            # Backpropagation
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        if batch % 100 == 0:
            loss, current = loss.item(), batch * len(x)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

In [17]:
for i in range(epochs):
    print(f"Epoch {i + 1}\n-------------------------------")
    train(trainLoader, cModel, loss_fn, optimizer)
print("Done!")

Epoch 1
-------------------------------
loss: 0.573557  [    0/25000]
loss: 0.313269  [ 1600/25000]
loss: 0.313269  [ 3200/25000]
loss: 0.313268  [ 4800/25000]
loss: 0.313267  [ 6400/25000]
loss: 0.313267  [ 8000/25000]


KeyboardInterrupt: 

# Evaluate

In [18]:
# neg
testInput = bertTokenizer("I think I will make a movie next weekend. Oh wait, I'm working..oh I'm sure I can fit it in. It looks like whoever made this film fit it in. I hope the makers of this crap have day jobs because this film sucked!!! It looks like someones home movie and I don't think more than $100 was spent making it!!! Total crap!!! Who let's this stuff be released?!?!?!",
              return_tensors='pt', padding=True, truncation=True)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [19]:
cModel.eval()

CombinationModel(
  (distilBert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1):

In [20]:
cModel(**testInput)

tensor([[8.2847e-06, 9.9999e-01]], grad_fn=<SoftmaxBackward0>)

# Save Model

In [None]:
import torch

In [29]:
# torch.save(cModel.state_dict(), "./weights/cModel_state_dict.pth")

In [21]:
torch.save(mapper.state_dict(), "./weights/mapper_state_dict.pth")

In [31]:
# torch.save(bertModel.state_dict(), "./weights/bertModel_state_dict.pth")

In [22]:
torch.save(cModel.outProj.state_dict(), "./weights/outProj_state_dict.pth")

# Load Model

In [1]:
from transformers import DistilBertModel, AutoTokenizer

In [2]:
bertTokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-multilingual-cased")
bertModel = DistilBertModel.from_pretrained("distilbert/distilbert-base-multilingual-cased")
# bertModel.load_state_dict(torch.load("./weights/bertModel_state_dict.pth"))
bertModel.eval()

DistilBertModel(
  (embeddings): Embeddings(
    (word_embeddings): Embedding(119547, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (layer): ModuleList(
      (0-5): 6 x TransformerBlock(
        (attention): MultiHeadSelfAttention(
          (dropout): Dropout(p=0.1, inplace=False)
          (q_lin): Linear(in_features=768, out_features=768, bias=True)
          (k_lin): Linear(in_features=768, out_features=768, bias=True)
          (v_lin): Linear(in_features=768, out_features=768, bias=True)
          (out_lin): Linear(in_features=768, out_features=768, bias=True)
        )
        (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (ffn): FFN(
          (dropout): Dropout(p=0.1, inplace=False)
          (lin1): Linear(in_features=768, out_features=3072, bias=True)
          (lin2): L

In [3]:
from utils.const import BlockType
from utils.DevConf import DevConf
from utils.AttnBlocksConf import AttnBlocksConf
devConf = DevConf('cpu')

In [4]:
import torch
from torch import nn

In [10]:
testOutput = nn.Linear(768, 2, device=devConf.device)
testOutput.load_state_dict(torch.load("./weights/outProj_state_dict.pth"))

<All keys matched successfully>

In [6]:
from model.BertDecoder.SentiClassifier import SentiClassifier

In [18]:
testMapper = SentiClassifier(AttnBlocksConf(768, 12, 6), BlockType.PARALLEL,devConf=devConf)
testMapper.load_state_dict(torch.load("./weights/mapper_state_dict.pth"))

<All keys matched successfully>

In [8]:
from model.CombinationModel import CombinationModel

In [19]:
cModel = CombinationModel(
    distilBert=bertModel,
    decoder=testMapper,
    outputProject=testOutput)
cModel.eval()

CombinationModel(
  (distilBert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1):

In [16]:
# neg
testInput = bertTokenizer("I think I will make a movie next weekend. Oh wait, I'm working..oh I'm sure I can fit it in. It looks like whoever made this film fit it in. I hope the makers of this crap have day jobs because this film sucked!!! It looks like someones home movie and I don't think more than $100 was spent making it!!! Total crap!!! Who let's this stuff be released?!?!?!",
              return_tensors='pt', padding=True, truncation=True)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [20]:
cModel(**testInput)

tensor([[0.5647, 0.4353]], grad_fn=<SoftmaxBackward0>)