In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

!pip install -q transformers

!nvidia-smi

Mounted at /content/drive
Thu Jul 28 05:12:09 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   42C    P0    24W / 300W |      0MiB / 16160MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-------------------------------------------------------------

# EarlyStopping

In [2]:
import copy
import torch
import numpy as np

class EarlyStopping:
    def __init__(self, patience=6, mode="max", max_epoch=1e6, min_epoch=0, at_last_score=None):
        self.patience = patience
        self.mode = mode
        self.max_epoch = max_epoch
        self.min_epoch = min_epoch
        self.at_last_score = at_last_score if at_last_score is not None else -np.Inf 
        self.epoch = 0
        self.early_stop = False
        self.best_model = None
        self.best_epoch = 0
        self.model_path = None
        self.best_score = -np.Inf if self.mode == "max" else np.Inf

    def __call__(self, epoch_score, model=None, model_path=None):
        self.model_path = model_path
        self.epoch += 1

        score = -epoch_score if self.mode == "min" else epoch_score
        
        if score <= self.best_score: 
            counter = self.epoch - self.best_epoch
            print('EarlyStopping counter: {} out of {}'.format(counter, self.patience))
            if (counter >= self.patience) and (self.best_score > self.at_last_score) and (self.epoch >= self.min_epoch):
                self.early_stop = True 
                self._save_checkpoint()
        else:                    
            self.best_score = score 
            self.best_epoch = self.epoch
            self.best_model = copy.deepcopy(model).cpu()
        
        if self.max_epoch <= self.epoch:
            self.early_stop = True 
            self._save_checkpoint()

    def _save_checkpoint(self):
        if self.model_path is not None and self.best_model is not None:
            torch.save(self.best_model.state_dict(), self.model_path.replace('_score','_'+str(self.best_score)))
            print('model saved at: ',self.model_path.replace('_score','_'+str(self.best_score)))

# CosineAnnealingWarmupRestarts

In [3]:
import math
import torch
from torch.optim.lr_scheduler import _LRScheduler

class CosineAnnealingWarmupRestarts(_LRScheduler):
    """
        optimizer (Optimizer): Wrapped optimizer.
        first_cycle_steps (int): First cycle step size.
        cycle_mult(float): Cycle steps magnification. Default: -1.
        max_lr(float): First cycle's max learning rate. Default: 0.1.
        min_lr(float): Min learning rate. Default: 0.001.
        warmup_steps(int): Linear warmup step size. Default: 0.
        gamma(float): Decrease rate of max learning rate by cycle. Default: 1.
        last_epoch (int): The index of last epoch. Default: -1.
    """

    def __init__(self,
                 optimizer: torch.optim.Optimizer,
                 first_cycle_steps: int,
                 cycle_mult: float = 1.,
                 max_lr: float = 0.1,
                 min_lr: float = 0.001,
                 warmup_steps: int = 0,
                 gamma: float = 1.,
                 last_epoch: int = -1):
        assert warmup_steps < first_cycle_steps

        self.first_cycle_steps = first_cycle_steps  # first cycle step size
        self.cycle_mult = cycle_mult  # cycle steps magnification
        self.base_max_lr = max_lr  # first max learning rate
        self.max_lr = max_lr  # max learning rate in the current cycle
        self.min_lr = min_lr  # min learning rate
        self.warmup_steps = warmup_steps  # warmup step size
        self.gamma = gamma  # decrease rate of max learning rate by cycle

        self.cur_cycle_steps = first_cycle_steps  # first cycle step size
        self.cycle = 0  # cycle count
        self.step_in_cycle = last_epoch  # step size of the current cycle

        super(CosineAnnealingWarmupRestarts, self).__init__(optimizer, last_epoch)

        # set learning rate min_lr
        self.init_lr()

    def init_lr(self):
        self.base_lrs = []
        for param_group in self.optimizer.param_groups:
            param_group['lr'] = self.min_lr
            self.base_lrs.append(self.min_lr)

    def get_lr(self):
        if self.step_in_cycle == -1:
            return self.base_lrs
        elif self.step_in_cycle < self.warmup_steps:
            return [(self.max_lr - base_lr) * self.step_in_cycle / self.warmup_steps + base_lr for base_lr in self.base_lrs]
        else:
            return [base_lr + (self.max_lr - base_lr)
                    * (1 + math.cos(math.pi * (self.step_in_cycle - self.warmup_steps)
                                    / (self.cur_cycle_steps - self.warmup_steps))) / 2
                    for base_lr in self.base_lrs]

    def step(self, epoch=None):
        if epoch is None:
            epoch = self.last_epoch + 1
            self.step_in_cycle = self.step_in_cycle + 1
            if self.step_in_cycle >= self.cur_cycle_steps:
                self.cycle += 1
                self.step_in_cycle = self.step_in_cycle - self.cur_cycle_steps
                self.cur_cycle_steps = int((self.cur_cycle_steps - self.warmup_steps) * self.cycle_mult) + self.warmup_steps
        else:
            if epoch >= self.first_cycle_steps:
                if self.cycle_mult == 1.:
                    self.step_in_cycle = epoch % self.first_cycle_steps
                    self.cycle = epoch // self.first_cycle_steps
                else:
                    n = int(math.log((epoch / self.first_cycle_steps * (self.cycle_mult - 1) + 1), self.cycle_mult))
                    self.cycle = n
                    self.step_in_cycle = epoch - int(self.first_cycle_steps * (self.cycle_mult ** n - 1) / (self.cycle_mult - 1))
                    self.cur_cycle_steps = self.first_cycle_steps * self.cycle_mult ** (n)
            else:
                self.cur_cycle_steps = self.first_cycle_steps
                self.step_in_cycle = epoch

        self.max_lr = self.base_max_lr * (self.gamma**self.cycle)
        self.last_epoch = math.floor(epoch)
        for param_group, lr in zip(self.optimizer.param_groups, self.get_lr()):
            param_group['lr'] = lr


# train_setup

In [4]:
import torch.nn.functional as f
import torch.nn as nn
import torch
import numpy as np
from transformers import AutoConfig, AutoModel


class _20SampleModel(nn.Module): # 이 새끼 dropout을 안때렸다,,,?
    def __init__(self, model_path):
        super(_20SampleModel, self).__init__()
        
        config = AutoConfig.from_pretrained(model_path)
        
        self.model = AutoModel.from_pretrained(model_path)
        self.top = nn.Linear(config.hidden_size+1, 1) # for train_fts

    def forward(self, ids, mask, fts, labels=None):
        x = self.model(ids, mask)[0]
        x = torch.cat((x[:, 0, :], fts), 1)
        x = self.top(x)
        
        if labels is not None:
            loss = self.get_loss(x, labels)
            return loss, x
        else:
            return x

    def get_loss(self, preds, targets):
        loss_fct = nn.L1Loss()
        loss = loss_fct(preds, targets)
        return loss   

In [5]:
%cd /content/drive/MyDrive/NLP/ENG/ai4code/src2

from transformers import AdamW

def train_setup(args):
    model = _20SampleModel(model_path=args.model_name_or_path)
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(
            nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(
            nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    num_train_optimization_steps = args.num_train_steps
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=3e-5, correct_bias=False)
    scheduler = (
        CosineAnnealingWarmupRestarts(
            optimizer=optimizer,
            first_cycle_steps=num_train_optimization_steps,
            cycle_mult=1,
            max_lr=args.max_lr,
            min_lr=args.min_lr,
            warmup_steps=num_train_optimization_steps * 0.2,
            gamma=1.,
            last_epoch=-1
        ))  # Pytorch scheduler

    scaler = torch.cuda.amp.GradScaler()

    return model, optimizer, scheduler, scaler

/content/drive/MyDrive/NLP/ENG/ai4code/src2


# AWP

In [6]:
class AWP:
    def __init__(
        self,
        model,
        optimizer,
        adv_param="weight",
        adv_lr=1,
        adv_eps=0.2,
        start_epoch=0,
        adv_step=1,
        scaler=None
    ):
        self.model = model
        self.optimizer = optimizer
        self.adv_param = adv_param
        self.adv_lr = adv_lr
        self.adv_eps = adv_eps
        self.start_epoch = start_epoch
        self.adv_step = adv_step
        self.backup = {}
        self.backup_eps = {}
        self.scaler = scaler

    def attack_backward(self, *inputs, target, epoch):
        if (self.adv_lr == 0) or (epoch < self.start_epoch):
            return None

        self._save() 
        for i in range(self.adv_step):
            self._attack_step() 
            with torch.cuda.amp.autocast():
                adv_loss, tr_logits = self.model(*inputs, labels=target)
                adv_loss = adv_loss.mean()
            self.optimizer.zero_grad()
            self.scaler.scale(adv_loss).backward()
            
        self._restore()

    def _attack_step(self):
        e = 1e-6
        for name, param in self.model.named_parameters():
            if param.requires_grad and param.grad is not None and self.adv_param in name:
                norm1 = torch.norm(param.grad)
                norm2 = torch.norm(param.data.detach())
                if norm1 != 0 and not torch.isnan(norm1):
                    r_at = self.adv_lr * param.grad / (norm1 + e) * (norm2 + e)
                    param.data.add_(r_at)
                    param.data = torch.min(
                        torch.max(param.data, self.backup_eps[name][0]), self.backup_eps[name][1]
                    )
                # param.data.clamp_(*self.backup_eps[name])

    def _save(self):
        for name, param in self.model.named_parameters():
            if param.requires_grad and param.grad is not None and self.adv_param in name:
                if name not in self.backup:
                    self.backup[name] = param.data.clone()
                    grad_eps = self.adv_eps * param.abs().detach()
                    self.backup_eps[name] = (
                        self.backup[name] - grad_eps,
                        self.backup[name] + grad_eps,
                    )

    def _restore(self,):
        for name, param in self.model.named_parameters():
            if name in self.backup:
                param.data = self.backup[name]
        self.backup = {}
        self.backup_eps = {}

# train

In [7]:
%cd /content/drive/MyDrive/NLP/ENG/ai4code/src2

from train import validate, read_data
from metrics import kendall_tau

import torch, sys, os
from tqdm import tqdm
import numpy as np


def train(model, train_loader, val_loader, optimizer, scheduler, scaler, val_df, df_orders, args):
    criterion = torch.nn.L1Loss()
    es = EarlyStopping(patience=4,max_epoch=args.epochs)
    awp = AWP(model,
              optimizer,
              adv_lr=1.,
              adv_eps=0.2,
            #   start_epoch=args.num_train_steps/args.epochs,
              start_epoch=0,
              scaler=scaler)
    preds_score = 0
    step = 0

    for e in range(args.epoch, 100):
        model.train()

        tbar = tqdm(train_loader, file=sys.stdout, position=0, leave=True)

        loss_list = []
        preds = []
        labels = []

        for idx, data in enumerate(tbar):
            step += 1

            inputs, target = read_data(data)

            with torch.cuda.amp.autocast():
                pred, loss = model(*inputs, labels=target)
            scaler.scale(loss).backward()

            # if preds_score > 0.82:
                # input_ids, labels, attention_mask
            awp.attack_backward(*inputs, target=target, epoch=step) 

            if idx % args.accumulation_steps == 0 or idx == len(tbar) - 1:
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad()
                scheduler.step()

            loss_list.append(loss.detach().cpu().item())
            preds.append(pred.detach().cpu().numpy().ravel())
            labels.append(target.detach().cpu().numpy().ravel())

            avg_loss = np.round(np.mean(loss_list), 4)

            tbar.set_description(
                f'Epoch {e+1} Loss: {avg_loss} lr: {scheduler.get_lr()}')

        y_val, y_pred = validate(model, val_loader)
        val_df['pred'] = val_df.groupby(['id', 'cell_type'])['rank'].rank(pct=True)
        val_df.loc[val_df['cell_type'] == 'markdown', 'pred'] = y_pred
        y_dummy = val_df.sort_values('pred').groupby('id')['cell_id'].apply(list)
        preds_score = kendall_tau(df_orders.loc[y_dummy.index], y_dummy)
        print("Preds score", preds_score)
        
        if not os.path.exists(args.output_path):
            os.mkdir(args.output_path)
            
        es(preds_score, model, model_path=args.output_path + f'/model_epoch_{e}_score.bin')
        if es.early_stop:
            break
        # torch.save(model.state_dict(), args.output_path + f'/model_epoch_{e}.bin')

/content/drive/MyDrive/NLP/ENG/ai4code/src2


# Main

In [8]:
%cd /content/drive/MyDrive/NLP/ENG/ai4code/src2

import easydict
import gc
import pandas as pd

from preprocessor import PairwisePreprocessor, _20CodeCellPreprocessor
from dataset import _20SampleDataset, PairwiseDataset, _20sample_data_setup, pairwise_data_setup
from util import pairwise_debug_setup, _20sample_debug_setup


args = {
    'model_name_or_path': 'microsoft/graphcodebert-base',

    'input_path': '../input/',

    'train_path': './data/train.csv',
    'train_mark_path': './data/train_mark.csv',
    'train_features_path': './data/train_fts.json',

    'val_path': "./data/val.csv",
    'val_mark_path': './data/val_mark.csv',
    'val_features_path': './data/val_fts.json',

    'output_path': './output-graphcodebert-20sample-debug',

    'md_max_len': 64,
    'total_max_len': 512,
    'batch_size': 16,
    'accumulation_steps': 2,
    'epoch': 0,
    'epochs': 5,
    'n_workers': 8,
    'debug': True,
    'load_train': False,
    'max_lr': 3e-5,
    'min_lr': .3e-6,
    'kfold': True
}

args = easydict.EasyDict(args)

preprocessor = _20CodeCellPreprocessor(**vars(args))
train_df, val_df, train_df_mark, val_df_mark, train_fts, val_fts = preprocessor.run()

print('before debug', train_df.shape, val_df.shape, train_df_mark.shape, val_df.shape, len(train_fts), len(val_fts))

kfolds = []
if args.debug:
    for i in range(5):
        fold = _20sample_debug_setup(train_df, train_df_mark, train_fts, val_df, val_df_mark, val_fts)
        kfolds.append(fold)
    
train_df, train_df_mark, train_fts, val_df, val_df_mark, val_fts = kfolds[0]
print('after debug', train_df.shape, val_df.shape, train_df_mark.shape, val_df.shape, len(train_fts), len(val_fts))

train_df, train_df_mark, train_fts, val_df, val_df_mark, val_fts = kfolds[1]
print('after debug', train_df.shape, val_df.shape, train_df_mark.shape, val_df.shape, len(train_fts), len(val_fts))

train_df, train_df_mark, train_fts, val_df, val_df_mark, val_fts = kfolds[2]
print('after debug', train_df.shape, val_df.shape, train_df_mark.shape, val_df.shape, len(train_fts), len(val_fts))

train_df, train_df_mark, train_fts, val_df, val_df_mark, val_fts = kfolds[3]
print('after debug', train_df.shape, val_df.shape, train_df_mark.shape, val_df.shape, len(train_fts), len(val_fts))

train_df, train_df_mark, train_fts, val_df, val_df_mark, val_fts = kfolds[4]
print('after debug', train_df.shape, val_df.shape, train_df_mark.shape, val_df.shape, len(train_fts), len(val_fts))

df_orders = pd.read_csv(args.input_path + 'train_orders.csv',
                        index_col='id',
                        squeeze=True).str.split()

/content/drive/MyDrive/NLP/ENG/ai4code/src2
train_df, val_df are already exits
train_fts, val_fts are already exists
before debug (5740832, 8) (629814, 8) (1950118, 8) (629814, 8) 125292 13964
after debug (579267, 8) (60771, 8) (196927, 8) (60771, 8) 12529 1396
after debug (572979, 8) (66500, 8) (194946, 8) (66500, 8) 12529 1396
after debug (571492, 8) (62541, 8) (195165, 8) (62541, 8) 12529 1396
after debug (573870, 8) (60704, 8) (194794, 8) (60704, 8) 12529 1396
after debug (573752, 8) (64702, 8) (195745, 8) (64702, 8) 12529 1396


In [9]:
for i in range(5):
    train_df, train_df_mark, train_fts, val_df, val_df_mark, val_fts = kfolds[i]

    train_loader, val_loader = _20sample_data_setup(train_df_mark, val_df_mark, train_fts, val_fts, args)

    del train_df, train_df_mark, train_fts
    gc.collect()

    args.num_train_steps = args.epochs * len(train_loader) / args.accumulation_steps

    model, optimizer, scheduler, scaler = train_setup(args)
    model.cuda()

    train(model, train_loader, val_loader, optimizer, scheduler, scaler, val_df, df_orders, args)

    del model, optimizer, scheduler, scaler, val_fts, train_loader, val_loader
    gc.collect()

Some weights of the model checkpoint at microsoft/graphcodebert-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.decoder.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at microsoft/graphcodebert-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to

  0%|          | 0/12307 [00:02<?, ?it/s]


RuntimeError: ignored