In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from project.data.data_loaders import get_clean_train_data
from sklearn.model_selection import GroupKFold
data = get_clean_train_data()
# Grouping on pn_num keeps every row of a patient note on the same side of the
# split. Only the first of the five folds is needed, so take it straight from
# the split generator.
train_indices, test_indices = next(
    GroupKFold(n_splits=5).split(
        X=data['pn_history'],
        y=data['location'],
        groups=data['pn_num'],
    )
)
train_data, valid_data = data.iloc[train_indices], data.iloc[test_indices]

In [3]:
# Leakage guard: no pn_num in the training split may also appear in the
# validation split.
assert (~train_data.pn_num.isin(valid_data.pn_num)).all()
print(f'Train data: {train_data.pn_num.nunique()}, valid data: {valid_data.pn_num.nunique()}')

Train data: 800, valid data: 200


In [4]:
import ast
def create_labels_for_scoring(df):
    """Turn each row's ``location`` annotations into ``[start, end]`` int pairs.

    Each row of ``df['location']`` is expected to hold a list of strings. A
    string contains one or more ``"start end"`` spans, with multiple spans in
    one string separated by ``';'``. Example:
    ``['0 1', '3 4;7 9']`` -> ``[[0, 1], [3, 4], [7, 9]]``.

    The frame is only read: no helper column is added to it, and rows are
    visited in positional order, so any index works (including the
    non-contiguous index of an ``iloc`` slice).

    Args:
        df: DataFrame with a ``location`` column as described above.

    Returns:
        A list with one entry per row of ``df``; each entry is a list of
        ``[start, end]`` pairs (empty when the row has no annotations).

    Raises:
        IndexError: if a span does not contain two whitespace-separated fields.
        ValueError: if a span field is not an integer.
    """
    truths = []
    for spans in df['location'].tolist():
        truth = []
        if spans:
            # Join first so that "a b;c d" entries and separate list entries
            # are handled by the same split.
            for span in ';'.join(spans).split(';'):
                fields = span.split()
                truth.append([int(fields[0]), int(fields[1])])
        truths.append(truth)
    return truths

In [5]:
# Placeholder values for arguments that will be parameterized later on.
# Goal: build everything up to (but not including) the training loop itself.

    # ====================================================
    # loader
    # ====================================================
# train_folds: = folds[folds['fold'] != fold].reset_index(drop=True)
# valid_folds = folds[folds['fold'] == fold].reset_index(drop=True)
# Raw note texts of the validation split, as an array.
valid_texts = valid_data.pn_history.values
# valid_labels = create_labels_for_scoring(valid_data.reset_index(drop=True)) TODO: Validation for later

In [6]:
from project.experiments.theoviel_reproduction.reproduced_model import TrainDataset
# Both splits go through the same dataset class.
train_dataset, valid_dataset = TrainDataset(train_data), TrainDataset(valid_data)

In [7]:
from torch.utils.data import DataLoader
from project.experiments.theoviel_reproduction.reproduced_model import CFG
# Training batches: reshuffled on each pass, incomplete final batch dropped.
train_loader = DataLoader(
    train_dataset,
    batch_size=CFG.batch_size,
    num_workers=CFG.num_workers,
    shuffle=True,
    drop_last=True,
    pin_memory=True,
)
# Validation batches: fixed order, every sample kept.
valid_loader = DataLoader(
    valid_dataset,
    batch_size=CFG.batch_size,
    num_workers=CFG.num_workers,
    shuffle=False,
    drop_last=False,
    pin_memory=True,
)

In [8]:
from project.experiments.theoviel_reproduction.reproduced_model import DebertaCustomModel
# ====================================================
# model
# ====================================================
    
# Build the network with its default constructor arguments. The optimizer is
# created in a later cell, so only the model is set up here.
model = DebertaCustomModel()
# Not ported yet: persisting the model config and moving the model to a device.
# torch.save(model.config, OUTPUT_DIR+'config.pth')
# model.to(device)


Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaModel: ['lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [9]:
from torch.optim import AdamW
def get_optimizer_params(model: "DebertaCustomModel", encoder_lr: float, decoder_lr: float, weight_decay=0.0):
    """Build the three optimizer parameter groups for ``model``.

    Groups, in order:
      1. encoder parameters subject to weight decay
         (``lr=encoder_lr``, ``weight_decay=weight_decay``);
      2. encoder bias / LayerNorm parameters
         (``lr=encoder_lr``, ``weight_decay=0.0``);
      3. every remaining parameter of ``model``, i.e. everything outside the
         encoder (``lr=decoder_lr``, ``weight_decay=0.0``).

    The encoder is whatever ``model.deberta_model()`` returns. The third group
    is selected by parameter identity rather than by looking for the substring
    ``"model"`` in parameter names, so each parameter ends up in exactly one
    group regardless of how the attributes of ``model`` are named.

    Args:
        model: module exposing ``deberta_model()`` and ``named_parameters()``.
        encoder_lr: learning rate for the encoder groups.
        decoder_lr: learning rate for the non-encoder group.
        weight_decay: weight decay applied to the first group only.

    Returns:
        A list of three dicts with keys ``'params'``, ``'lr'`` and
        ``'weight_decay'``.
    """
    no_decay = ("bias", "LayerNorm.bias", "LayerNorm.weight")

    encoder_named = list(model.deberta_model().named_parameters())
    encoder_ids = {id(param) for _, param in encoder_named}

    encoder_decay = []
    encoder_no_decay = []
    for name, param in encoder_named:
        if any(nd in name for nd in no_decay):
            encoder_no_decay.append(param)
        else:
            encoder_decay.append(param)

    # Everything the encoder does not own belongs to the head/decoder group.
    head_params = [param for _, param in model.named_parameters() if id(param) not in encoder_ids]

    optimizer_parameters = [
        {'params': encoder_decay, 'lr': encoder_lr, 'weight_decay': weight_decay},
        {'params': encoder_no_decay, 'lr': encoder_lr, 'weight_decay': 0.0},
        {'params': head_params, 'lr': decoder_lr, 'weight_decay': 0.0},
    ]
    return optimizer_parameters

optimizer_parameters = get_optimizer_params(
    model,
    encoder_lr=CFG.encoder_lr,
    decoder_lr=CFG.decoder_lr,
    weight_decay=CFG.weight_decay,
)
# Every group built above carries its own 'lr' and 'weight_decay' entries;
# the lr given here is the optimizer-level default.
optimizer = AdamW(
    params=optimizer_parameters,
    lr=CFG.encoder_lr,
    betas=CFG.betas,
    eps=CFG.eps,
)

In [10]:
from torch.optim import Optimizer
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup

    # ====================================================
    # scheduler
    # ====================================================
def get_scheduler(cfg: "CFG", optimizer: Optimizer, num_train_steps: int):
    """Create the warmup learning-rate scheduler selected by ``cfg.scheduler``.

    Args:
        cfg: configuration providing ``scheduler`` (``'linear'`` or
            ``'cosine'``), ``num_warmup_steps`` and, for the cosine schedule,
            ``num_cycles``.
        optimizer: optimizer whose learning rate is scheduled.
        num_train_steps: total number of training steps, passed through as
            ``num_training_steps``.

    Returns:
        The scheduler built by ``get_linear_schedule_with_warmup`` or
        ``get_cosine_schedule_with_warmup``.

    Raises:
        ValueError: if ``cfg.scheduler`` is neither ``'linear'`` nor
            ``'cosine'`` (previously this surfaced as an UnboundLocalError).
    """
    if cfg.scheduler == 'linear':
        return get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
        )
    if cfg.scheduler == 'cosine':
        return get_cosine_schedule_with_warmup(
            optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
        )
    raise ValueError(
        f"Unsupported scheduler {cfg.scheduler!r}; expected 'linear' or 'cosine'"
    )
    
# Total scheduler steps: samples per batch-size, times epochs, truncated.
# NOTE(review): train_loader is built with drop_last=True, so the number of
# batches actually produced per epoch may be lower than this estimate - confirm
# whether len(train_loader) * CFG.epochs is the intended count.
num_train_steps = int(len(train_data) / CFG.batch_size * CFG.epochs)
scheduler = get_scheduler(cfg=CFG, optimizer=optimizer, num_train_steps=num_train_steps)


In [None]:
import torch.nn as nn
# ====================================================
# loop
# ====================================================
def train_loop(fold, train_loader, valid_loader, model, optimizer, scheduler,
               device, valid_folds, valid_texts, valid_labels):
    """Train for ``CFG.epochs`` epochs and keep the best-scoring checkpoint.

    This code used to sit at the top level of the cell with mixed indentation
    and a bare ``return``, which does not parse. It is wrapped in a function so
    the cell is valid Python and its inputs are explicit; nothing is trained
    until the function is called.

    NOTE(review): ``train_fn``, ``valid_fn``, ``get_char_probs``,
    ``get_results``, ``get_predictions``, ``get_score``, ``LOGGER``,
    ``OUTPUT_DIR`` and (when ``CFG.wandb`` is truthy) ``wandb`` are not defined
    in the visible notebook; they must be in scope before this is called.

    Args:
        fold: fold identifier, used in log keys and the checkpoint file name.
        train_loader: loader handed to ``train_fn``.
        valid_loader: loader handed to ``valid_fn``.
        model: model to train; its ``state_dict()`` is checkpointed.
        optimizer: optimizer handed to ``train_fn``.
        scheduler: scheduler handed to ``train_fn``.
        device: device handed to ``train_fn`` and ``valid_fn``.
        valid_folds: validation frame; its length fixes the number of
            prediction rows and it receives the best predictions as columns.
        valid_texts: validation texts handed to ``get_char_probs``.
        valid_labels: ground truth handed to ``get_score``.

    Returns:
        ``valid_folds`` with columns ``0 .. CFG.max_len - 1`` set to the
        predictions stored in the best checkpoint.
    """
    import gc
    import time

    import torch

    criterion = nn.BCEWithLogitsLoss(reduction="none")

    best_score = 0.
    # Single source of truth for the checkpoint location (written and re-read below).
    best_path = OUTPUT_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth"

    for epoch in range(CFG.epochs):

        start_time = time.time()

        # train
        avg_loss = train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device)

        # eval
        avg_val_loss, predictions = valid_fn(valid_loader, model, criterion, device)
        predictions = predictions.reshape((len(valid_folds), CFG.max_len))

        # scoring
        char_probs = get_char_probs(valid_texts, predictions, CFG.tokenizer)
        results = get_results(char_probs, th=0.5)
        preds = get_predictions(results)
        score = get_score(valid_labels, preds)

        elapsed = time.time() - start_time

        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch+1} - Score: {score:.4f}')
        if CFG.wandb:
            wandb.log({f"[fold{fold}] epoch": epoch+1,
                       f"[fold{fold}] avg_train_loss": avg_loss,
                       f"[fold{fold}] avg_val_loss": avg_val_loss,
                       f"[fold{fold}] score": score})

        # A checkpoint is written only when the score strictly improves on the
        # best so far (starting from 0.).
        if best_score < score:
            best_score = score
            LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            torch.save({'model': model.state_dict(),
                        'predictions': predictions},
                        best_path)

    # NOTE(review): if no epoch ever scores above 0 the checkpoint file is
    # never written and this load fails - confirm whether that case matters.
    predictions = torch.load(best_path,
                             map_location=torch.device('cpu'))['predictions']
    valid_folds[list(range(CFG.max_len))] = predictions

    torch.cuda.empty_cache()
    gc.collect()

    return valid_folds