In [1]:
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F

from transformers import Trainer, TrainingArguments
from transformers import AutoModel, AutoConfig, AutoTokenizer, AutoModelForSequenceClassification
from transformers import DataCollatorWithPadding

import datasets
from datasets import load_dataset, Dataset, DatasetDict

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score



In [2]:
import warnings, logging, os, gc
# Ignore every Python warning raised from here on.
warnings.simplefilter('ignore')
# Suppress all logging calls of severity WARNING and below, for every logger.
logging.disable(logging.WARNING)

In [3]:
# Switch for Weights & Biases tracking; it also selects `report_to` in the
# training arguments and gates the `wandb.init` call in the training loop.
USE_WANDB = False

if USE_WANDB:
    # wandb is only imported when tracking is enabled.
    import wandb
    # Project name; read back from the environment when the run is initialised.
    os.environ['WANDB_PROJECT'] = 'imdb'

In [4]:
# Data directory name (not referenced by any of the cells visible here).
data_dir = 'data'
# Root folder under which each fold's Trainer output directory is created.
ckpt_base_dir = 'outputs'

In [5]:
# Pretrained checkpoint identifier; used for the tokenizer, the encoder
# backbone, and as part of the output directory / run names.
model_name = 'microsoft/deberta-v3-small'

In [6]:
# Tokenizer for `model_name`; shared by the dataset tokenization step, the
# padding collator and the Trainer.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [7]:
def _tokenize_batch(batch):
    """Tokenize a batch of raw reviews, truncating each one to 1024 tokens."""
    return tokenizer(batch['text'], max_length=1024, truncation=True)


def _tokenize_split(split):
    """Return `split` with its 'text' column replaced by the tokenizer outputs."""
    return split.map(_tokenize_batch, batched=True, remove_columns='text')


ds = load_dataset('imdb')
train_tok_ds = _tokenize_split(ds['train'])
test_tok_ds = _tokenize_split(ds['test'])
train_tok_ds

  0%|          | 0/3 [00:00<?, ?it/s]

Dataset({
    features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 25000
})

In [8]:
# Longest tokenized training example (sanity check against the truncation limit).
max(len(token_ids) for token_ids in train_tok_ds['input_ids'])

1024

In [9]:
# Turn off the tokenizers library's own parallelism through its environment switch.
# NOTE(review): presumably set here so that the forked DataLoader workers
# (dataloader_num_workers=4 in the training arguments) do not trigger the
# tokenizers fork warning — confirm.
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

In [10]:
from transformers.modeling_outputs import SequenceClassifierOutput

class MeanPooling(nn.Module):
    """Mask-aware average of token embeddings along dimension 1."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        """Average the hidden states of the positions where the mask is non-zero.

        Args:
            last_hidden_state: token embeddings; as used by the classifier in
                this notebook this is the encoder's last hidden state
                (assumed (batch, seq_len, hidden)).
            attention_mask: mask with one entry per token; it gets a trailing
                axis and is broadcast to the shape of `last_hidden_state`.

        Returns:
            Float tensor holding, per example, the masked sum over dimension 1
            divided by the mask count (clamped to at least 1e-9 so a fully
            masked row yields zeros instead of a division by zero).
        """
        weights = attention_mask[..., None].expand_as(last_hidden_state).to(torch.float32)
        masked_total = (last_hidden_state * weights).sum(dim=1)
        token_count = weights.sum(dim=1).clamp(min=1e-9)
        return masked_total / token_count

class TransformerModel(nn.Module):
    """Binary sequence classifier built on a pretrained encoder.

    Pipeline: encoder -> masked mean pooling -> dropout -> single-unit linear
    head. Training loss is `BCEWithLogitsLoss` on the raw head output.

    Note: the `logits` field of the returned `SequenceClassifierOutput` holds
    sigmoid *probabilities* (shape (batch, 1)), not raw logits, so downstream
    code should threshold at 0.5.
    """

    def __init__(self, model_name):
        """Load the config and encoder weights for `model_name` and build the head."""
        super().__init__()

        self.config = AutoConfig.from_pretrained(model_name)
        self.config.update({
            'output_hidden_states': False,
        })
        hidden_size = self.config.hidden_size
        self.transformer = AutoModel.from_pretrained(model_name, config=self.config)

        self.dropout = nn.Dropout(0.1)

        self.pooler = MeanPooling()

        # One output unit: a single score per example for the binary task.
        self.out = nn.Linear(hidden_size, 1)

        self.loss_fn = nn.BCEWithLogitsLoss()

    def forward(self, input_ids, attention_mask, token_type_ids=None, labels=None):
        """Run the classifier.

        Args:
            input_ids, attention_mask, token_type_ids: forwarded unchanged to
                the encoder; `attention_mask` also drives the mean pooling.
            labels: optional targets, one per example (assumed shape (batch,)
                — TODO confirm); cast to float for the BCE loss.

        Returns:
            `SequenceClassifierOutput` with `loss` (None when `labels` is not
            given) and `logits` set to the sigmoid of the head output.
        """
        encoded = self.transformer(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids
        )
        pooled = self.pooler(encoded.last_hidden_state, attention_mask)
        pooled = self.dropout(pooled)

        logits = self.out(pooled)

        loss = None
        if labels is not None:
            # squeeze(-1), not squeeze(): a bare squeeze() would also drop the
            # batch axis when the batch holds a single example, producing a
            # 0-d tensor that BCEWithLogitsLoss rejects against (1,) labels.
            loss = self.loss_fn(logits.squeeze(-1), labels.float())

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits.sigmoid()
        )

In [11]:
def accuracy_metric(eval_pred):
    """Compute accuracy for the Trainer's `compute_metrics` hook.

    `eval_pred` unpacks into (predictions, labels). The predictions are
    thresholded at 0.5 (the model in this notebook emits sigmoid outputs)
    and compared with the labels via scikit-learn's `accuracy_score`.
    """
    scores, targets = eval_pred
    hard_preds = (scores > 0.5).astype(int)
    accuracy = accuracy_score(targets, hard_preds)
    return {'accuracy': accuracy}

In [12]:
# Training hyper-parameters consumed by the TrainingArguments built in get_trainer.
lr = 3e-5        # learning rate
bs = 4           # per-device train batch size (evaluation uses bs*2)
accum = 2        # gradient accumulation steps
wd = 0.01        # weight decay
epochs = 1       # number of training epochs
grad_checkpoint = False   # enable gradient checkpointing on the encoder
freeze_layers = False     # not referenced by the cells visible here

In [13]:
# Experiment tag; embedded in the output directory and in the run/group names.
exp_name = 'baseline'

In [14]:
def get_trainer(dds, fold=0):
    """Build a `Trainer` for one cross-validation fold.

    Args:
        dds: mapping with a 'train' and a 'test' dataset for this fold
            ('test' is used as the evaluation set during training).
        fold: fold index, used only to name the output directory and the run.
            It is passed in explicitly rather than read from the training
            loop's variable, so the function does not depend on hidden
            notebook state. Defaults to 0, the only fold this notebook trains.

    Returns:
        A `Trainer` wrapping a freshly created `TransformerModel`.
    """
    # Local import: `set_seed` is not in the notebook's top-level import cell.
    from transformers import set_seed

    collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8)
    args = TrainingArguments(
        f'{ckpt_base_dir}/{model_name}-{exp_name}-fold-{fold}',
        learning_rate=lr, warmup_ratio=0.,
        gradient_accumulation_steps=accum,
        lr_scheduler_type='cosine',
        fp16=True,
        fp16_full_eval=True,
        evaluation_strategy="epoch",
        logging_strategy='epoch',
        per_device_train_batch_size=bs,
        per_device_eval_batch_size=bs*2,
        greater_is_better=True,
        group_by_length=True,
        num_train_epochs=epochs,
        weight_decay=wd,
        report_to='wandb' if USE_WANDB else 'none',
        run_name=f'{model_name}/{exp_name}/fold-{fold}',
        save_strategy='no',
        save_total_limit=1,
        dataloader_num_workers=4,
        dataloader_pin_memory=False,
    )

    # The Trainer seeds the RNGs only inside its own constructor, i.e. after
    # the model handed to it already exists. Seed first so the randomly
    # initialised classification head is reproducible across runs.
    set_seed(args.seed)
    model = TransformerModel(model_name)

    if grad_checkpoint:
        model.transformer.gradient_checkpointing_enable()

    return Trainer(model, args, train_dataset=dds['train'], eval_dataset=dds['test'],
                   tokenizer=tokenizer, data_collator=collator, compute_metrics=accuracy_metric)

# Stratified 10-fold split over the training set; only fold 0 is trained below.
cv = StratifiedKFold(n_splits=10, random_state=42, shuffle=True)
idxs = np.arange(len(train_tok_ds))

for i, (train_idx, val_idx) in enumerate(cv.split(idxs, train_tok_ds['label'])):
    # just using a single model
    if i != 0: continue

    if USE_WANDB:
        wandb.init(
            project=os.environ['WANDB_PROJECT'],
            name=f'{model_name}/{exp_name}/fold-{i}',
            group=f'{model_name}/{exp_name}',
            save_code=True
        )
    dds = DatasetDict({
        "train": train_tok_ds.select(train_idx),
        "test": train_tok_ds.select(val_idx)
    })

    trainer = get_trainer(dds, fold=i)

    trainer.train()

    # Collect garbage first so that memory released by the collector can be
    # handed back by the CUDA cache flush that follows.
    gc.collect()
    torch.cuda.empty_cache()

Epoch,Training Loss,Validation Loss,Accuracy
0,0.2368,0.194979,0.95


In [15]:
# Score the trained model on the tokenized test split and display its accuracy.
test_metrics = trainer.predict(test_tok_ds).metrics
test_metrics['test_accuracy']

0.9524