In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

!pip install -q transformers

!nvidia-smi

Mounted at /content/drive
[K     |████████████████████████████████| 4.4 MB 2.8 MB/s 
[K     |████████████████████████████████| 6.6 MB 30.3 MB/s 
[K     |████████████████████████████████| 596 kB 76.5 MB/s 
[K     |████████████████████████████████| 101 kB 12.3 MB/s 
[?25hMon Jul 25 07:38:46 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   36C    P0    24W / 300W |      0MiB / 16160MiB |      0%      Default |
|                               |                    

# args

In [10]:
%cd /content/drive/MyDrive/NLP/ENG/ai4code/src2

import easydict
import gc
import pandas as pd

from preprocessor import PairwisePreprocessor, _20CodeCellPreprocessor
from dataset import _20SampleDataset, PairwiseDataset, _20sample_data_setup, pairwise_data_setup
from train import pairwise_train_setup, train_setup
from util import pairwise_debug_setup, _20sample_debug_setup
from metrics import kendall_tau
from train import train

args = {
    'model_name_or_path': 'microsoft/graphcodebert-base',

    'input_path': '../input/',

    'train_path': './data/train.csv',
    'train_mark_path': './data/train_mark.csv',
    'train_features_path': './data/train_fts.json',

    'val_path': "./data/val.csv",
    'val_mark_path': './data/val_mark.csv',
    'val_features_path': './data/val_fts.json',

    'output_path': './output-graphcodebert-20sample-es-dbg',

    'md_max_len': 64,
    'total_max_len': 512,
    'batch_size': 16,
    'accumulation_steps': 2,
    'epoch': 0,
    'epochs': 100,
    'n_workers': 8,
    'debug': True,
    'load_train': False,
    'max_lr': 3e-5,
    'min_lr': .3e-6
}

args = easydict.EasyDict(args)

/content/drive/MyDrive/NLP/ENG/ai4code/src2


# Class EarlyStopping

In [4]:
import copy

class EarlyStopping:
    def __init__(self, patience=6, mode="max", max_epoch=1e6, min_epoch=0, at_last_score=None):
        self.patience = patience
        self.mode = mode
        self.max_epoch = max_epoch
        self.min_epoch = min_epoch
        self.at_last_score = at_last_score if at_last_score is not None else -np.Inf 
        self.epoch = 0
        self.early_stop = False
        self.best_model = None
        self.best_epoch = 0
        self.model_path = None
        self.best_score = -np.Inf if self.mode == "max" else np.Inf

    def __call__(self, epoch_score, model=None, model_path=None):
        self.model_path = model_path
        self.epoch += 1

        score = -epoch_score if self.mode == "min" else epoch_score
        
        if score <= self.best_score: 
            counter = self.epoch - self.best_epoch
            print('EarlyStopping counter: {} out of {}'.format(counter, self.patience))
            if (counter >= self.patience) and (self.best_score > self.at_last_score) and (self.epoch >= self.min_epoch):
                self.early_stop = True 
                self._save_checkpoint()
        else:                    
            self.best_score = score 
            self.best_epoch = self.epoch
            self.best_model = copy.deepcopy(model).cpu()
        
        if self.max_epoch <= self.epoch:
            self.early_stop = True 
            self._save_checkpoint()

    def _save_checkpoint(self):
        if self.model_path is not None and self.best_model is not None:
            torch.save(self.best_model.state_dict(), self.model_path.replace('_score','_'+str(self.best_score)))
            print('model saved at: ',self.model_path.replace('_score','_'+str(self.best_score)))

# train

In [6]:
%cd /content/drive/MyDrive/NLP/ENG/ai4code/src2

from train import validate, read_data

import torch, sys, os
from tqdm import tqdm
import numpy as np


def train(model, train_loader, val_loader, optimizer, scheduler, scaler, val_df, df_orders, args):
    criterion = torch.nn.L1Loss()
    es = EarlyStopping(patience=4,max_epoch=args.epochs)

    for e in range(args.epoch, args.epochs):
        model.train()

        tbar = tqdm(train_loader, file=sys.stdout, position=0, leave=True)

        loss_list = []
        preds = []
        labels = []

        for idx, data in enumerate(tbar):
            inputs, target = read_data(data)

            with torch.cuda.amp.autocast():
                pred = model(*inputs)
                loss = criterion(pred, target)
            scaler.scale(loss).backward()

            if idx % args.accumulation_steps == 0 or idx == len(tbar) - 1:
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad()
                scheduler.step()

            loss_list.append(loss.detach().cpu().item())
            preds.append(pred.detach().cpu().numpy().ravel())
            labels.append(target.detach().cpu().numpy().ravel())

            avg_loss = np.round(np.mean(loss_list), 4)

            tbar.set_description(
                f'Epoch {e+1} Loss: {avg_loss} lr: {scheduler.get_last_lr()}')

        y_val, y_pred = validate(model, val_loader)
        val_df['pred'] = val_df.groupby(['id', 'cell_type'])['rank'].rank(pct=True)
        val_df.loc[val_df['cell_type'] == 'markdown', 'pred'] = y_pred
        y_dummy = val_df.sort_values('pred').groupby('id')['cell_id'].apply(list)
        preds_score = kendall_tau(df_orders.loc[y_dummy.index], y_dummy)
        print("Preds score", preds_score)
        
        if not os.path.exists(args.output_path):
            os.mkdir(args.output_path)
            
        es(preds_score, model, model_path=args.output_path + f'/model_epoch_{e}_score.bin')
        if es.early_stop:
            break
        # torch.save(model.state_dict(), args.output_path + f'/model_epoch_{e}.bin')

/content/drive/MyDrive/NLP/ENG/ai4code/src2


# Main

In [7]:
%cd /content/drive/MyDrive/NLP/ENG/ai4code/src2

preprocessor = _20CodeCellPreprocessor(**vars(args))
train_df, val_df, train_df_mark, val_df_mark, train_fts, val_fts = preprocessor.run()

print('before debug', train_df.shape, val_df.shape, train_df_mark.shape, val_df.shape, len(train_fts), len(val_fts))

if args.debug:
    train_df, train_df_mark, train_fts, val_df, val_df_mark, val_fts = \
     _20sample_debug_setup(train_df, train_df_mark, train_fts, val_df, val_df_mark, val_fts)

print('after debug', train_df.shape, val_df.shape, train_df_mark.shape, val_df.shape, len(train_fts), len(val_fts))

/content/drive/MyDrive/NLP/ENG/ai4code/src2
train_df, val_df are already exits
train_fts, val_fts are already exists
before debug (5740832, 8) (629814, 8) (1950118, 8) (629814, 8) 125292 13964
after debug (567792, 8) (63809, 8) (192281, 8) (63809, 8) 12529 1396


In [8]:
train_loader, val_loader = _20sample_data_setup(train_df_mark, val_df_mark, train_fts, val_fts, args)

df_orders = pd.read_csv(args.input_path + 'train_orders.csv',
                        index_col='id',
                        squeeze=True).str.split()

del preprocessor, train_df, train_df_mark, train_fts
gc.collect()

Downloading:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/539 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/772 [00:00<?, ?B/s]

15

In [11]:
args.num_train_steps = args.epochs * len(train_loader) / args.accumulation_steps

model, optimizer, scheduler, scaler = train_setup(args)

model.cuda()

train(model, train_loader, val_loader, optimizer, scheduler, scaler, val_df, df_orders, args)

Some weights of the model checkpoint at microsoft/graphcodebert-base were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.decoder.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at microsoft/graphcodebert-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to

Epoch 1 Loss: 0.4909 lr: [9.985853374386287e-10, 9.985853374386287e-10]:   0%|          | 1/12017 [00:00<3:09:03,  1.06it/s]



Epoch 1 Loss: 0.242 lr: [6.00049929266872e-06, 6.00049929266872e-06]: 100%|██████████| 12017/12017 [37:45<00:00,  5.30it/s]
100%|██████████| 1368/1368 [01:24<00:00, 16.11it/s]
Preds score 0.7570766117061613
Epoch 2 Loss: 0.1575 lr: [1.200099858533744e-05, 1.200099858533744e-05]: 100%|██████████| 12017/12017 [37:44<00:00,  5.31it/s]
100%|██████████| 1368/1368 [01:24<00:00, 16.10it/s]
Preds score 0.7896791351580534
Epoch 3 Loss: 0.1355 lr: [1.800149787800616e-05, 1.800149787800616e-05]: 100%|██████████| 12017/12017 [37:45<00:00,  5.30it/s]
100%|██████████| 1368/1368 [01:25<00:00, 16.08it/s]
Preds score 0.7980632787660007
Epoch 4 Loss: 0.1234 lr: [2.400199717067488e-05, 2.400199717067488e-05]: 100%|██████████| 12017/12017 [37:46<00:00,  5.30it/s]
100%|██████████| 1368/1368 [01:25<00:00, 16.00it/s]
Preds score 0.7993264791099737
Epoch 5 Loss: 0.1147 lr: [2.9999868607192444e-05, 2.9999868607192444e-05]: 100%|██████████| 12017/12017 [37:48<00:00,  5.30it/s]
100%|██████████| 1368/1368 [01:25<

KeyboardInterrupt: ignored