In [1]:
import pandas as pd

In [2]:
pretrain_df = pd.read_csv('./input/pretrain/train.csv')

In [3]:
list(filter(bool, 'Line 1\n\nLine 3\rLine 4\r\n'.splitlines()))

['Line 1', 'Line 3', 'Line 4']

In [4]:
train_lines = []

for text in pretrain_df.full_text:
    train_lines += list(filter(bool, text.splitlines()));
    
train_lines = pd.DataFrame(train_lines, columns = ['train_lines'])
train_lines.to_csv('./input/pretrain/train_MLM.csv', index = False)

In [5]:
import torch
import pandas as pd;
import numpy as np;
import os
from torch.utils.data import Dataset, DataLoader
from torch.utils.checkpoint import checkpoint
from transformers import AutoTokenizer, AutoModelWithLMHead
from transformers import AdamW
from tqdm import tqdm
import os

In [6]:
class CFG:
    seed = 42;
    model_name = 'microsoft/deberta-v3-large'
    epochs = 3;
    batch_size = 4;
    lr = 1e-6;
    weight_decay = 1e-6
    max_len = 512
    mask_prob = 0.15;
    n_accumulate = 4
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [7]:
import numpy as np;
import os
def set_seed(seed = CFG.seed):
    np.random.seed(seed);
    torch.manual_seed(seed);
    torch.cuda.manual_seed(seed);
    torch.backends.cudnn.deterministic = True;
    torch.backends.cudnn.benchmark = True;
    os.environ['PYTHONHASHSEED'] = str(seed)
set_seed()

In [8]:
tokenizer = AutoTokenizer.from_pretrained(CFG.model_name);
model = AutoModelWithLMHead.from_pretrained(CFG.model_name);

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of DebertaV2ForMaskedLM were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
special_tokens = tokenizer.encode_plus('[CLS] [SEP] [MASK] [PAD]',
                                      add_special_tokens = False,
                                      return_tensors='pt')
special_tokens = torch.flatten(special_tokens['input_ids'])
special_tokens

tensor([     1,      2, 128000,      0])

In [10]:
def getMaskedLabels(input_ids):
    rand = torch.rand(input_ids.shape);
    mask_arr = (rand < CFG.mask_prob);
    
    for special_token in special_tokens:
        token = special_token.item();
        mask_arr *= (input_ids != token);
    selection = torch.flatten(mask_arr[0].nonzero()).tolist()
    input_ids[selection] = 128000
    
    return input_ids

In [11]:
class MLMDataset:
    def __init__(self, data, tokenizer):
        self.data = data;
        self.tokenizer = tokenizer
        
    def __len__(self):
        return len(self.data);
    
    def __getitem__(self, idx):
        text = self.data[idx]
        
        tokenized_data = self.tokenizer.encode_plus(
                            text,
                            max_length = CFG.max_len,
                            truncation = True,
                            padding = 'max_length',
                            add_special_tokens = True,
                            return_tensors = 'pt'
                        )
        input_ids = torch.flatten(tokenized_data.input_ids);
        attention_mask = torch.flatten(tokenized_data.input_ids);
        labels = getMaskedLabels(input_ids)
        
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels
        }

In [12]:
train_lines.train_lines.unique()

array(["I think that students would benefit from learning at home,because they wont have to change and get up early in the morning to shower and do there hair. taking only classes helps them because at there house they'll be pay more attention. they will be comfortable at home.",
       "The hardest part of school is getting ready. you wake up go brush your teeth and go to your closet and look at your cloths. after you think you picked a outfit u go look in the mirror and youll either not like it or you look and see a stain. Then you'll have to change. with the online classes you can wear anything and stay home and you wont need to stress about what to wear.",
       'most students usually take showers before school. they either take it before they sleep or when they wake up. some students do both to smell good. that causes them do miss the bus and effects on there lesson time cause they come late to school. when u have online classes u wont need to miss lessons cause you can get every

In [13]:
train_data = MLMDataset(train_lines.train_lines.unique(), tokenizer)
dataloader = DataLoader(train_data, batch_size = CFG.batch_size, shuffle = True)

In [14]:
len(train_data), len(dataloader)

(21386, 5347)

In [15]:
optimizer = AdamW(model.parameters(), lr = CFG.lr, weight_decay = CFG.weight_decay);



In [16]:
def train_loop(model, device):
    model.train()
    batch_losses = []
    loop = tqdm(dataloader, leave=True)
    for batch_num, batch in enumerate(loop):
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)

        loss = outputs.loss
        batch_loss = loss / CFG.n_accumulate
        batch_losses.append(batch_loss.item())
    
        loop.set_description(f"Epoch {epoch + 1}")
        loop.set_postfix(loss=batch_loss.item())
        batch_loss.backward()
        
        if batch_num % CFG.n_accumulate == 0 or batch_num == len(dataloader):
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5)
            optimizer.step()
            model.zero_grad()

    return np.mean(batch_losses)

In [17]:
device = CFG.device
model.to(device)
history = []
best_loss = np.inf
prev_loss = np.inf
model.gradient_checkpointing_enable()
print(f"Gradient Checkpointing: {model.is_gradient_checkpointing}")

for epoch in range(CFG.epochs):
    loss = train_loop(model, device)
    history.append(loss)
    print(f"Loss: {loss}")
    if loss < best_loss:
        print("New Best Loss {:.4f} -> {:.4f}, Saving Model".format(prev_loss, loss))
        # torch.save(model.state_dict(), "./deberta_mlm.pt")
        model.save_pretrained('./input/pretrain/pretrained_model/')
        best_loss = loss
    prev_loss = loss

Gradient Checkpointing: True


Epoch 1: 100%|██████████| 5347/5347 [1:22:51<00:00,  1.08it/s, loss=0.553]


Loss: 1.1724517982051108
New Best Loss inf -> 1.1725, Saving Model


Epoch 2: 100%|██████████| 5347/5347 [1:22:50<00:00,  1.08it/s, loss=0.521]


Loss: 0.4743986433357619
New Best Loss 1.1725 -> 0.4744, Saving Model


Epoch 3: 100%|██████████| 5347/5347 [1:22:50<00:00,  1.08it/s, loss=0.311]


Loss: 0.33300676586829947
New Best Loss 0.4744 -> 0.3330, Saving Model


In [29]:
device

device(type='cuda')

### Try to load the model

In [20]:
from transformers import AutoConfig
config = AutoConfig.from_pretrained('./input/pretrain/pretrained_model/')

In [22]:
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained('./input/pretrain/pretrained_model/', config = config)

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at ./input/pretrain/pretrained_model/ and are newly initialized: ['pooler.dense.weight', 'pooler.dense.bias', 'classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
