In [1]:
from utils.DevConf import DevConf
# Device configuration used throughout the notebook (tokenized batches and labels are moved to DEV_CONF.device).
# NOTE(review): several recorded outputs below show device='mps:0', so they were produced under a different setting than 'cpu'.
DEV_CONF = DevConf(device='cpu')

# Define Model

In [2]:
from transformers import AutoTokenizer, DistilBertModel
# Load the pretrained multilingual DistilBERT tokenizer and encoder by checkpoint name.
bertTokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-multilingual-cased")
bertModel = DistilBertModel.from_pretrained("distilbert/distilbert-base-multilingual-cased")
# Put the encoder in evaluation mode; as the last expression of the cell, the call's result (the module repr) is displayed.
# NOTE(review): train() later calls model.train() on the wrapping CombinationModel, which presumably
# switches this submodule back to training mode — confirm whether the encoder is meant to stay in eval mode.
bertModel.eval()

  from .autonotebook import tqdm as notebook_tqdm


DistilBertModel(
  (embeddings): Embeddings(
    (word_embeddings): Embedding(119547, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (layer): ModuleList(
      (0-5): 6 x TransformerBlock(
        (attention): MultiHeadSelfAttention(
          (dropout): Dropout(p=0.1, inplace=False)
          (q_lin): Linear(in_features=768, out_features=768, bias=True)
          (k_lin): Linear(in_features=768, out_features=768, bias=True)
          (v_lin): Linear(in_features=768, out_features=768, bias=True)
          (out_lin): Linear(in_features=768, out_features=768, bias=True)
        )
        (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (ffn): FFN(
          (dropout): Dropout(p=0.1, inplace=False)
          (lin1): Linear(in_features=768, out_features=3072, bias=True)
          (lin2): L

In [3]:
from utils.AttnBlocksConf import AttnBlocksConf

In [4]:
from model.BertDecoder.SentiClassifier import SentiClassifier
from model.CombinationModel import CombinationModel
from torch import nn

In [5]:
from utils.const import BlockType

In [6]:
# Build the sentiment decoder, then combine it with the DistilBERT encoder and a
# 768 -> 2 linear output projection (768 matches the encoder width shown in the repr above).
# NOTE(review): the "Load Model" section builds the decoder as
# SentiClassifier(AttnBlocksConf(768, 12, 6), BlockType.PARALLEL, devConf=...), i.e. with a different
# argument layout — confirm which call matches the current project code.
mapper = SentiClassifier(6, AttnBlocksConf(768, 12), BlockType.PARALLEL)
cModel = CombinationModel(
    distilBert=bertModel,
    decoder=mapper,
    outputProject=nn.Linear(768, 2),
    devConf=DEV_CONF)

# Load Data

In [7]:
from torchtext.datasets import IMDB
from torchtext.data.functional import to_map_style_dataset

In [8]:
# IMDB training split (stored under ./data), wrapped in an iterator.
# NOTE(review): because of iter(), this object can presumably be consumed only once; re-running the
# DataLoader cell without re-running this one may see an exhausted iterator — confirm.
trainData = iter(IMDB(root="./data", split='train'))

In [9]:
from torch.utils.data import DataLoader

In [10]:
import torch
from torch import Tensor

In [11]:
def collate_fn(batch)->tuple[Tensor, Tensor, Tensor]:
    """Collate ``(label, text)`` samples into tensors for the model.

    Args:
        batch: iterable of ``(label, text)`` pairs, as handed over by the DataLoader.

    Returns:
        A triple ``(input_ids, attention_mask, labels)``. The first two come from
        ``bertTokenizer`` (padded to the longest text in the batch, truncated to
        512 tokens) and are moved to ``DEV_CONF.device``. ``labels`` holds
        ``label - 1`` for every sample as an int16 tensor and is not moved to
        the device here.
    """
    samples = list(batch)
    texts = [text for _, text in samples]
    # Shift every label down by one so the targets start at 0.
    shifted_labels = [label - 1 for label, _ in samples]

    encoded = bertTokenizer(
        texts,
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=512,
    ).to(DEV_CONF.device)

    label_tensor = torch.tensor(shifted_labels, dtype=torch.int16)
    return encoded['input_ids'], encoded['attention_mask'], label_tensor

In [12]:
# Convert the split to a map-style dataset and serve it in shuffled batches of 16;
# collate_fn tokenizes each batch and builds the label tensor.
trainLoader = DataLoader(to_map_style_dataset(trainData), collate_fn=collate_fn, batch_size=16, shuffle=True)

In [13]:
next(iter(trainLoader))

(tensor([[  101, 11065, 21852,  ...,     0,     0,     0],
         [  101, 10117, 12839,  ...,     0,     0,     0],
         [  101,   138, 48333,  ...,   146, 18957,   102],
         ...,
         [  101, 10747, 10124,  ..., 10473, 25259,   102],
         [  101,   146, 10944,  ...,     0,     0,     0],
         [  101, 10117, 18379,  ...,     0,     0,     0]], device='mps:0'),
 tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 1, 1, 1],
         ...,
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]], device='mps:0'),
 tensor([0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0], dtype=torch.int16))

# Train

In [14]:
# Training hyperparameters: learning rate passed to AdamW and number of passes over the training loader.
lr = 1e-5
epochs = 1

In [15]:
# Classification loss handed to train().
# NOTE(review): the recorded model outputs carry grad_fn=<SoftmaxBackward0>, i.e. the model appears to
# return probabilities, while nn.CrossEntropyLoss expects unnormalised logits — confirm that the
# forward pass does not apply softmax before this loss during training.
loss_fn = nn.CrossEntropyLoss()

In [16]:
# AdamW over every parameter exposed by the combined model.
# NOTE(review): DistilBERT is a submodule of cModel, so unless CombinationModel freezes it the encoder
# is updated too, yet only the decoder and output projection are saved in "Save Model" — confirm.
optimizer = torch.optim.AdamW(cModel.parameters(), lr=lr)

In [17]:
def train(dataloader, model, loss_fn, optimizer, max_seq_len=512, log_every=100):
    """Run one training pass over ``dataloader``.

    Each batch must be an ``(input_ids, attention_mask, labels)`` triple, as
    produced by ``collate_fn``. Labels are moved to ``DEV_CONF.device`` and cast
    to ``torch.long`` before the loss is computed.

    Args:
        dataloader: iterable of batches exposing a sized ``.dataset`` attribute.
        model: callable taking ``input_ids=`` and ``attention_mask=`` keyword
            arguments; ``model.train()`` is called once before the loop.
        loss_fn: callable ``loss_fn(pred, labels)`` returning a scalar tensor.
        optimizer: optimizer providing ``zero_grad()`` and ``step()``.
        max_seq_len: batches whose second dimension exceeds this are skipped.
        log_every: print the loss for batches whose index is a multiple of this
            positive integer.

    Returns:
        None. Progress is reported through ``print``.
    """
    size = len(dataloader.dataset)
    model.train()
    for batch, (input_ids, attention_mask, labels) in enumerate(dataloader):
        # Skip over-long batches before doing any work. The logging below must
        # only run for batches that actually produced a loss: previously a
        # skipped batch reached the logging code and either raised
        # UnboundLocalError (first batch) or reported a stale loss.
        if input_ids.shape[1] > max_seq_len:
            continue

        labels = labels.to(DEV_CONF.device).to(torch.long)

        # Compute prediction error
        pred = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = loss_fn(pred, labels)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch % log_every == 0:
            # Use a separate name so the loss tensor is not shadowed by a float.
            loss_value, current = loss.item(), batch * len(input_ids)
            print(f"loss: {loss_value:>7f}  [{current:>5d}/{size:>5d}]")

In [18]:
# Run the configured number of epochs over the training loader.
cModel.train()  # train() also calls model.train() at the start of every epoch
for i in range(epochs):
    # Epoch banner uses 1-based numbering.
    print(f"Epoch {i + 1}\n-------------------------------")
    train(trainLoader, cModel, loss_fn, optimizer)
print("Done!")

Epoch 1
-------------------------------
loss: 0.689786  [    0/25000]
Done!


# Evaluate

In [20]:
# Switch the combined model to evaluation mode for the spot checks below;
# the call is the cell's last expression, so the module repr is displayed.
cModel.eval()

CombinationModel(
  (distilBert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1):

In [21]:
# neg
testInput = bertTokenizer("I think I will make a movie next weekend. Oh wait, I'm working..oh I'm sure I can fit it in. It looks like whoever made this film fit it in. I hope the makers of this crap have day jobs because this film sucked!!! It looks like someones home movie and I don't think more than $100 was spent making it!!! Total crap!!! Who let's this stuff be released?!?!?!",
              return_tensors='pt', padding=True, truncation=True).to(DEV_CONF.device)

In [22]:
# Forward pass on the tokenized review; the cell output is a 1x2 score tensor.
# NOTE(review): the recorded output carries a grad_fn, so autograd is tracking this call —
# consider running inference under torch.no_grad().
cModel(**testInput)

tensor([[0.6433, 0.3567]], device='mps:0', grad_fn=<SoftmaxBackward0>)

In [None]:
# pos
testInput = bertTokenizer("Before Dogma 95: when Lars used movies as art, not just a story. A beautiful painting about love and death. This is one of my favorite movies of all time. The color... The music... Just perfect."
                          , return_tensors='pt', padding=True, truncation=True).to(DEV_CONF.device)

: 

In [None]:
cModel(**testInput)

tensor([[1.0809e-06, 1.0000e+00]], grad_fn=<SoftmaxBackward0>)

In [None]:
# neg
testInput = bertTokenizer("Ned aKelly is such an important story to Australians but this movie is awful. It's an Australian story yet it seems like it was set in America. Also Ned was an Australian yet he has an Irish accent...it is the worst film I have seen in a long time",
                          return_tensors='pt', padding=True, truncation=True).to(DEV_CONF.device)

In [None]:
cModel(**testInput)

tensor([[0.7629, 0.2371]], grad_fn=<SoftmaxBackward0>)

# Save Model

In [None]:
import torch

In [None]:
# torch.save(cModel.state_dict(), "./weights/cModel_state_dict.pth")

In [None]:
# Persist only the decoder weights.
# NOTE(review): this writes the "-gqa" file, whereas the "Load Model" section reads
# "./weights/mapper_state_dict.pth" — confirm which checkpoint is meant to be reloaded.
torch.save(mapper.state_dict(), "./weights/mapper_state_dict-gqa.pth")

In [None]:
# torch.save(bertModel.state_dict(), "./weights/bertModel_state_dict.pth")

In [None]:
# Persist the output projection weights.
# NOTE(review): this writes the "-gqa" file, whereas the "Load Model" section reads
# "./weights/outProj_state_dict.pth" — confirm which checkpoint is meant to be reloaded.
torch.save(cModel.outProj.state_dict(), "./weights/outProj_state_dict-gqa.pth")

# Load Model

In [None]:
from transformers import DistilBertModel, AutoTokenizer

In [None]:
# Reload the stock pretrained tokenizer and encoder for the inference-only setup.
bertTokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-multilingual-cased")
bertModel = DistilBertModel.from_pretrained("distilbert/distilbert-base-multilingual-cased")
# Restoring encoder weights from disk is disabled, so the pretrained checkpoint is used as-is.
# bertModel.load_state_dict(torch.load("./weights/bertModel_state_dict.pth"))
# Evaluation mode; the module repr is displayed because this is the cell's last expression.
bertModel.eval()

DistilBertModel(
  (embeddings): Embeddings(
    (word_embeddings): Embedding(119547, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (layer): ModuleList(
      (0-5): 6 x TransformerBlock(
        (attention): MultiHeadSelfAttention(
          (dropout): Dropout(p=0.1, inplace=False)
          (q_lin): Linear(in_features=768, out_features=768, bias=True)
          (k_lin): Linear(in_features=768, out_features=768, bias=True)
          (v_lin): Linear(in_features=768, out_features=768, bias=True)
          (out_lin): Linear(in_features=768, out_features=768, bias=True)
        )
        (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (ffn): FFN(
          (dropout): Dropout(p=0.1, inplace=False)
          (lin1): Linear(in_features=768, out_features=3072, bias=True)
          (lin2): L

In [None]:
from utils.const import BlockType
from utils.DevConf import DevConf
from utils.AttnBlocksConf import AttnBlocksConf
# Device configuration for the reload/inference section.
# NOTE(review): the first cell passes device='cpu' by keyword; this positional form is presumably equivalent — confirm.
DEV_CONF = DevConf('cpu')

In [None]:
import torch
from torch import nn

In [None]:
# Recreate the 768 -> 2 output projection on the configured device and restore its saved weights.
# map_location remaps tensors that were serialised on another device (some recorded outputs show
# 'mps:0') onto DEV_CONF.device, so the checkpoint also loads on machines without that device.
# NOTE(review): "Save Model" writes "./weights/outProj_state_dict-gqa.pth", while this cell reads
# the file without the "-gqa" suffix — confirm the intended checkpoint.
testOutput = nn.Linear(768, 2, device=DEV_CONF.device)
testOutput.load_state_dict(torch.load("./weights/outProj_state_dict.pth", map_location=DEV_CONF.device))

<All keys matched successfully>

In [None]:
from model.BertDecoder.SentiClassifier import SentiClassifier

In [None]:
# Rebuild the decoder and restore its saved weights.
# map_location remaps tensors that were serialised on another device (some recorded outputs show
# 'mps:0') onto DEV_CONF.device, so the checkpoint also loads on machines without that device.
# NOTE(review): "Save Model" writes "./weights/mapper_state_dict-gqa.pth", while this cell reads the
# file without the "-gqa" suffix, and the constructor arguments differ from the first
# SentiClassifier(6, AttnBlocksConf(768, 12), ...) call — confirm both against the current project code.
testMapper = SentiClassifier(AttnBlocksConf(768, 12, 6), BlockType.PARALLEL,devConf=DEV_CONF)
testMapper.load_state_dict(torch.load("./weights/mapper_state_dict.pth", map_location=DEV_CONF.device))

<All keys matched successfully>

In [None]:
from model.CombinationModel import CombinationModel

In [None]:
# Reassemble the full model from the pretrained encoder and the restored decoder / output projection.
# NOTE(review): unlike the first construction, no devConf argument is passed here — confirm the
# constructor's default is appropriate.
cModel = CombinationModel(
    distilBert=bertModel,
    decoder=testMapper,
    outputProject=testOutput)
# Evaluation mode for inference; the module repr is displayed because this is the cell's last expression.
cModel.eval()

CombinationModel(
  (distilBert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1):

In [None]:
# neg
testInput = bertTokenizer("I think I will make a movie next weekend. Oh wait, I'm working..oh I'm sure I can fit it in. It looks like whoever made this film fit it in. I hope the makers of this crap have day jobs because this film sucked!!! It looks like someones home movie and I don't think more than $100 was spent making it!!! Total crap!!! Who let's this stuff be released?!?!?!",
              return_tensors='pt', padding=True, truncation=True)

In [None]:
cModel(**testInput)

tensor([[9.9990e-01, 1.0072e-04]], grad_fn=<SoftmaxBackward0>)